File Coverage

blib/lib/Pod/Simple/BlackBox.pm
Criterion Covered Total %
statement 806 1007 80.0
branch 368 526 69.9
condition 132 189 69.8
subroutine 62 65 95.3
pod 0 7 0.0
total 1368 1794 76.2


line stmt bran cond sub pod time code
1             package Pod::Simple::BlackBox;
2             #
3             # "What's in the box?" "Pain."
4             #
5             ###########################################################################
6             #
7             # This is where all the scary things happen: parsing lines into
8             # paragraphs; and then into directives, verbatims, and then also
9             # turning formatting sequences into treelets.
10             #
11             # Are you really sure you want to read this code?
12             #
13             #-----------------------------------------------------------------------------
14             #
15             # The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16             # of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17             # to call the proper callbacks on the treelets.
18             #
19             # Every node in a treelet is a ['name', {attrhash}, ...children...]
20              
21 68     68   2997 use integer; # vroom!
  68         278  
  68         429  
22 68     68   2028 use strict;
  68         136  
  68         1243  
23 68     68   300 use warnings;
  68         122  
  68         1444  
24 68     68   331 use Carp ();
  68         138  
  68         12931  
25             our $VERSION = '3.45';
26             #use constant DEBUG => 7;
27              
28             sub my_qr ($$) {
29              
30             # $1 is a pattern to compile and return. Older perls compile any
31             # syntactically valid property, even if it isn't legal. To cope with
32             # this, return an empty string unless the compiled pattern also
33             # successfully matches $2, which the caller furnishes.
34              
35 624     624 0 1733 my ($input_re, $should_match) = @_;
36             # XXX could have a third parameter $shouldnt_match for extra safety
37              
38 624 50       4013 my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39              
40 68     68   569 my $re = eval "no warnings; $use_utf8 qr/$input_re/";
  68     68   158  
  68     68   3001  
  68     68   447  
  68     68   149  
  68     68   4596  
  68     68   45390  
  68     68   2485  
  68     68   1186  
  68     68   1732  
  68     3   202  
  68     3   2400  
  68     3   418  
  68     3   140  
  68         2319  
  68         445  
  68         139  
  68         2413  
  68         417  
  68         138  
  68         2548  
  68         414  
  68         136  
  68         2315  
  68         401  
  68         134  
  68         2357  
  68         468  
  68         139  
  68         3826  
  3         40  
  3         8  
  3         147  
  3         17  
  3         7  
  3         126  
  3         17  
  3         6  
  3         106  
  3         15  
  3         6  
  3         100  
  624         36824  
41             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
42 624 50       2444 return "" if $@;
43              
44 68     68   905 my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
  68     68   151  
  68     68   2934  
  68     68   545  
  68     68   146  
  68     68   3363  
  68     68   464  
  68     68   157  
  68     68   2599  
  68     3   458  
  68     3   134  
  68     3   2313  
  68     3   435  
  68         133  
  68         2547  
  68         425  
  68         150  
  68         2383  
  68         422  
  68         129  
  68         2574  
  68         419  
  68         132  
  68         2399  
  68         414  
  68         138  
  68         2988  
  3         19  
  3         5  
  3         124  
  3         22  
  3         6  
  3         127  
  3         25  
  3         6  
  3         104  
  3         19  
  3         10  
  3         110  
  624         34876  
45             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
46 624 50       2417 return "" if $@;
47              
48             #print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49 624 50       2180 return $re if $matches;
50              
51             #print STDERR __LINE__, ": $re: didn't match\n";
52 0         0 return "";
53             }
54              
55             BEGIN {
56 68     68   1933 require Pod::Simple;
57 68 50       5946 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58             }
59              
60             # Matches a character iff the character will have a different meaning
61             # if we choose CP1252 vs UTF-8 if there is no =encoding line.
62             # This is broken for early Perls on non-ASCII platforms.
63             my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64             $non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65              
66             # Use patterns understandable by Perl 5.6, if possible
67 68     68   508 my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
  68         190  
  68         68600  
68             my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69             # to get assigned
70             my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71             "\x{250}");
72             $rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73              
74 68     68   428 my $script_run_re = eval 'no warnings "experimental::script_run";
  68         129  
  68         16734  
75             qr/(*script_run: ^ .* $ )/x';
76             my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77             unless ($latin_re) {
78             # This was machine generated to be the ranges of the union of the above
79             # three properties, with things that were undefined by Unicode 4.1 filling
80             # gaps. That is the version in use when Perl advanced enough to
81             # successfully compile and execute the above pattern.
82             $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83             }
84              
85             my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86              
87             # Latin script code points not in the first release of Unicode
88             my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89              
90             # If this perl doesn't have the Deprecated property, there's only one code
91             # point in it that we need be concerned with.
92             my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93             $deprecated_re = qr/\x{149}/ unless $deprecated_re;
94              
95             my $utf8_bom;
96             if (($] ge 5.007_003)) {
97             $utf8_bom = "\x{FEFF}";
98             utf8::encode($utf8_bom);
99             } else {
100             $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101             }
102              
103             # This is used so that the 'content_seen' method doesn't return true on a
104             # file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105             # there is a valid =foo line will we return that content was seen.
106             my $seen_legal_directive = 0;
107              
108             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109              
110 0     0 0 0 sub parse_line { shift->parse_lines(@_) } # alias
111              
112             # - - - Turn back now! Run away! - - -
113              
114             sub parse_lines { # Usage: $parser->parse_lines(@lines)
115             # an undef means end-of-stream
116 9477     9477 0 14576 my $self = shift;
117              
118 9477         14416 my $code_handler = $self->{'code_handler'};
119 9477         13969 my $cut_handler = $self->{'cut_handler'};
120 9477         12926 my $wl_handler = $self->{'whiteline_handler'};
121 9477   100     19384 $self->{'line_count'} ||= 0;
122              
123 9477         12029 my $scratch;
124              
125             DEBUG > 4 and
126 9477         11683 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127              
128 9477         11440 DEBUG > 5 and
129             print STDERR "# About to parse lines: ",
130             join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131              
132 9477   100     20197 my $paras = ($self->{'paras'} ||= []);
133             # paragraph buffer. Because we need to defer processing of =over
134             # directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135             # to process this.
136              
137 9477   100     19034 $self->{'pod_para_count'} ||= 0;
138              
139             # An attempt to match the pod portions of a line. This is not fool proof,
140             # but is good enough to serve as part of the heuristic for guessing the pod
141             # encoding if not specified.
142 9477         12786 my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
  90736         187603  
  9477         47050  
143 9477         46599 my $pod_chars_re = qr/ ^ = [A-Za-z]+ | [\Q$codes\E] < /x;
144              
145 9477         15128 my $line;
146 9477         16761 foreach my $source_line (@_) {
147 19029 50       39996 if( $self->{'source_dead'} ) {
148 0         0 DEBUG > 4 and print STDERR "# Source is dead.\n";
149 0         0 last;
150             }
151              
152 19029 100       37396 unless( defined $source_line ) {
153 890         1300 DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154              
155 890         2896 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156 890         2511 push @$paras, $paras->[-1], $paras->[-1];
157             # So that it definitely fills the buffer.
158 890         1697 $self->{'source_dead'} = 1;
159 890         2376 $self->_ponder_paragraph_buffer;
160 890         1717 next;
161             }
162              
163              
164 18139 100       33846 if( $self->{'line_count'}++ ) {
165 17253         37011 ($line = $source_line) =~ tr/\n\r//d;
166             # If we don't have two vars, we'll end up with that there
167             # tr/// modding the (potentially read-only) original source line!
168              
169             } else {
170 886         1220 DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171              
172 886 50       5694 if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
    50          
    50          
173 0         0 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174 0         0 $self->_handle_encoding_line( "=encoding utf8" );
175 0         0 delete $self->{'_processed_encoding'};
176 0         0 $line =~ tr/\n\r//d;
177              
178             } elsif( $line =~ s/^\xFE\xFF//s ) {
179 0         0 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180             $self->scream(
181 0         0 $self->{'line_count'},
182             "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183             );
184 0         0 splice @_;
185 0         0 push @_, undef;
186 0         0 next;
187              
188             # TODO: implement somehow?
189              
190             } elsif( $line =~ s/^\xFF\xFE//s ) {
191 0         0 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192             $self->scream(
193 0         0 $self->{'line_count'},
194             "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195             );
196 0         0 splice @_;
197 0         0 push @_, undef;
198 0         0 next;
199              
200             # TODO: implement somehow?
201              
202             } else {
203 886         1723 DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204 886         2465 ($line = $source_line) =~ tr/\n\r//d;
205             }
206             }
207              
208 18139 100 100     124359 if(!$self->{'parse_characters'} && !$self->{'encoding'}
      100        
      100        
      100        
209             && ($self->{'in_pod'} || $line =~ /^=/s)
210             && $line =~ /$non_ascii_re/
211             ) {
212              
213 21         40 my $encoding;
214              
215             # No =encoding line, and we are at the first pod line in the input that
216             # contains a non-ascii byte, that is, one whose meaning varies depending
217             # on whether the file is encoded in UTF-8 or CP1252, which are the two
218             # possibilities permitted by the pod spec. (ASCII is assumed if the
219             # file only contains ASCII bytes.) In order to process this line, we
220             # need to figure out what encoding we will use for the file.
221             #
222             # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223             # 160-255, but it is used here, as it often colloquially is, to refer to
224             # the complete set of code points 0-255, including ASCII (0-127), the C1
225             # controls (128-159), and strict Latin 1 (160-255).
226             #
227             # CP1252 is effectively a superset of Latin 1, because it differs only
228             # from colloquial 8859-1 in the C1 controls, which are very unlikely to
229             # actually be present in 8859-1 files, so can be used for other purposes
230             # without conflict. CP 1252 uses most of them for graphic characters.
231             #
232             # Note that all ASCII-range bytes represent their corresponding code
233             # points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234             # code points require multiple (non-ASCII) bytes to represent. (A
235             # separate paragraph for EBCDIC is below.) The multi-byte
236             # representation is quite structured. If we find an isolated byte that
237             # would require multiple bytes to represent in UTF-8, we know that the
238             # encoding is not UTF-8. If we find a sequence of bytes that violates
239             # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240             # hence must be 1252.
241             #
242             # But there are ambiguous cases where we could guess wrong. If so, the
243             # user will end up having to supply an =encoding line. We use all
244             # readily available information to improve our chances of guessing
245             # right. The odds of something not being UTF-8, but still passing a
246             # UTF-8 validity test go down very rapidly with increasing length of the
247             # sequence. Therefore we look at all non-ascii sequences on the line.
248             # If any of the sequences can't be UTF-8, we quit there and choose
249             # CP1252. If all could be UTF-8, we see if any of the code points
250             # represented are unlikely to be in pod. If so, we guess CP1252. If
251             # not, we check if the line is all in the same script; if not guess
252             # CP1252; otherwise UTF-8. For perls that don't have convenient script
253             # run testing, see if there is both Latin and non-Latin. If so, CP1252,
254             # otherwise UTF-8.
255             #
256             # On EBCDIC platforms, the situation is somewhat different. In
257             # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258             # but so do the bytes that are for the C1 controls. Recall that these
259             # correspond to the unused portion of 8859-1 that 1252 mostly takes
260             # over. That means that there are fewer code points that are
261             # represented by multi-bytes. But, note that the these controls are
262             # very unlikely to be in pod text. So if we encounter one of them, it
263             # means that it is quite likely CP1252 and not UTF-8. The net result is
264             # the same code below is used for both platforms.
265             #
266             # XXX probably if the line has E that evaluates to illegal CP1252,
267             # then it is UTF-8. But we haven't processed E<> yet.
268              
269 21 50       151 goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
270              
271 21         45 my $copy;
272              
273 68     68   534 no warnings 'utf8';
  68         156  
  68         6545  
274              
275 21 50       81 if ($] ge 5.007_003) {
276 21         50 $copy = $line;
277              
278             # On perls that have this function, we can use it to easily see if the
279             # sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280             # needed below for script run detection
281 21 100       344 goto set_1252 if ! utf8::decode($copy);
282             }
283             elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284             # code page doing here anyway?
285             goto set_utf8;
286             }
287             else { # ASCII, no decode(): do it ourselves using the fundamental
288             # characteristics of UTF-8
289 68     68   45073 use if $] le 5.006002, 'utf8';
  68         943  
  68         1130  
290              
291 0         0 my $char_ord;
292             my $needed; # How many continuation bytes to gobble up
293              
294             # Initialize the translated line with a dummy character that will be
295             # deleted after everything else is done. This dummy makes sure that
296             # $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297             # with upgrading in the middle
298 0         0 $copy = chr(0x100);
299              
300             # Parse through the line
301 0         0 for (my $i = 0; $i < length $line; $i++) {
302 0         0 my $byte = substr($line, $i, 1);
303              
304             # ASCII bytes are trivially dealt with
305 0 0       0 if ($byte !~ $non_ascii_re) {
306 0         0 $copy .= $byte;
307 0         0 next;
308             }
309              
310 0         0 my $b_ord = ord $byte;
311              
312             # Now figure out what this code point would be if the input is
313             # actually in UTF-8. If, in the process, we discover that it isn't
314             # well-formed UTF-8, we guess CP1252.
315             #
316             # Start the process. If it is UTF-8, we are at the first, start
317             # byte, of a multi-byte sequence. We look at this byte to figure
318             # out how many continuation bytes are needed, and to initialize the
319             # code point accumulator with the data from this byte.
320             #
321             # Normally the minimum continuation byte is 0x80, but in certain
322             # instances the minimum is a higher number. So the code below
323             # overrides this for those instances.
324 0         0 my $min_cont = 0x80;
325              
326 0 0       0 if ($b_ord < 0xC2) { # A start byte < C2 is malformed
    0          
    0          
    0          
327 0         0 goto set_1252;
328             }
329             elsif ($b_ord <= 0xDF) {
330 0         0 $needed = 1;
331 0         0 $char_ord = $b_ord & 0x1F;
332             }
333             elsif ($b_ord <= 0xEF) {
334 0 0       0 $min_cont = 0xA0 if $b_ord == 0xE0;
335 0         0 $needed = 2;
336 0         0 $char_ord = $b_ord & (0x1F >> 1);
337             }
338             elsif ($b_ord <= 0xF4) {
339 0 0       0 $min_cont = 0x90 if $b_ord == 0xF0;
340 0         0 $needed = 3;
341 0         0 $char_ord = $b_ord & (0x1F >> 2);
342             }
343             else { # F4 is the highest start byte for legal Unicode; higher is
344             # unlikely to be in pod.
345 0         0 goto set_1252;
346             }
347              
348             # ? not enough continuation bytes available
349 0 0       0 goto set_1252 if $i + $needed >= length $line;
350              
351             # Accumulate the ordinal of the character from the remaining
352             # (continuation) bytes.
353 0         0 while ($needed-- > 0) {
354 0         0 my $cont = substr($line, ++$i, 1);
355 0         0 $b_ord = ord $cont;
356 0 0 0     0 goto set_1252 if $b_ord < $min_cont || $b_ord > 0xBF;
357              
358             # In all cases, any next continuation bytes all have the same
359             # minimum legal value
360 0         0 $min_cont = 0x80;
361              
362             # Accumulate this byte's contribution to the code point
363 0         0 $char_ord <<= 6;
364 0         0 $char_ord |= ($b_ord & 0x3F);
365             }
366              
367             # Here, the sequence that formed this code point was valid UTF-8,
368             # so add the completed character to the output
369 0         0 $copy .= chr $char_ord;
370             } # End of loop through line
371              
372             # Delete the dummy first character
373 0         0 $copy = substr($copy, 1);
374             }
375              
376             # Here, $copy is legal UTF-8.
377              
378             # If it can't be legal CP1252, no need to look further. (These bytes
379             # aren't valid in CP1252.) This test could have been placed higher in
380             # the code, but it seemed wrong to set the encoding to UTF-8 without
381             # making sure that the very first instance is well-formed. But what if
382             # it isn't legal CP1252 either? We have to choose one or the other, and
383             # It seems safer to favor the single-byte encoding over the multi-byte.
384 6 50       19 goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385              
386             # The C1 controls are not likely to appear in pod
387 6 50       27 goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388              
389             # Nor are surrogates nor unassigned, nor deprecated.
390 6 50       38 DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391 6 50 33     51 goto set_1252 if $cs_re && $copy =~ $cs_re;
392 6 50 33     56 DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393 6 50 33     37 goto set_1252 if $cn_re && $copy =~ $cn_re;
394 6 50       34 DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395 6 50       24 goto set_1252 if $copy =~ $deprecated_re;
396              
397             # Nor are rare code points. But this is hard to determine. khw
398             # believes that IPA characters and the modifier letters are unlikely to
399             # be in pod (and certainly very unlikely to be the in the first line in
400             # the pod containing non-ASCII)
401 6 100       44 DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402 6 100 66     37 goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403              
404             # The first Unicode version included essentially every Latin character
405             # in modern usage. So, a Latin character not in the first release will
406             # unlikely be in pod.
407 5 50 33     43 DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408 5 50 33     30 goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409              
410             # On perls that handle script runs, if the UTF-8 interpretation yields
411             # a single script, we guess UTF-8, otherwise just having a mixture of
412             # scripts is suspicious, so guess CP1252. We first strip off, as best
413             # we can, the ASCII characters that look like they are pod directives,
414             # as these would always show as mixed with non-Latin text.
415 5         75 $copy =~ s/$pod_chars_re//g;
416              
417 5 50       19 if ($script_run_re) {
418 0 0       0 goto set_utf8 if $copy =~ $script_run_re;
419 0         0 DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420 0         0 goto set_1252;
421             }
422              
423             # Even without script runs, but on recent enough perls and Unicodes, we
424             # can check if there is a mixture of both Latin and non-Latin. Again,
425             # having a mixture of scripts is suspicious, so assume CP1252
426              
427             # If it's all non-Latin, there is no CP1252, as that is Latin
428             # characters and punct, etc.
429 5 50       38 DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430 5 50       33 goto set_utf8 if $copy !~ $latin_re;
431              
432 5 100       70 DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433 5 100       44 goto set_utf8 if $copy =~ $every_char_is_latin_re;
434              
435 1         2 DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436              
437 17         24 set_1252:
438             DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439 17         35 $encoding = 'CP1252';
440 17         47 goto done_set;
441              
442 4         5 set_utf8:
443             DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444 4         9 $encoding = 'UTF-8';
445              
446 21         128 done_set:
447             $self->_handle_encoding_line( "=encoding $encoding" );
448 21         46 delete $self->{'_processed_encoding'};
449 21 50       101 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450              
451 21         400 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;
452              
453             $self->whine(
454 21         171 $self->{'line_count'},
455             "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456             );
457             }
458              
459 18139         26444 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460              
461 18139 100       33122 if(!$self->{'in_pod'}) {
462 1700 100       4884 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
463 941 100       2659 if($1 eq 'cut') {
464             $self->scream(
465 4         40 $self->{'line_count'},
466             "=cut found outside a pod block. Skipping to next block."
467             );
468              
469             ## Before there were errata sections in the world, it was
470             ## least-pessimal to abort processing the file. But now we can
471             ## just barrel on thru (but still not start a pod block).
472             #splice @_;
473             #push @_, undef;
474              
475 4         10 next;
476             } else {
477             $self->{'in_pod'} = $self->{'start_of_pod_block'}
478 937         2586 = $self->{'last_was_blank'} = 1;
479             # And fall thru to the pod-mode block further down
480             }
481             } else {
482 759         972 DEBUG > 5 and print STDERR "# It's a code-line.\n";
483 759 100       1481 $code_handler->(map $_, $line, $self->{'line_count'}, $self)
484             if $code_handler;
485             # Note: this may cause code to be processed out of order relative
486             # to pods, but in order relative to cuts.
487              
488             # Note also that we haven't yet applied the transcoding to $line
489             # by time we call $code_handler!
490              
491 759 50       1572 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
492             # That RE is from perlsyn, section "Plain Old Comments (Not!)",
493             #$fname = $2 if defined $2;
494             #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495 0         0 DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496 0         0 $self->{'line_count'} = $1 - 1;
497             }
498              
499 759         1540 next;
500             }
501             }
502              
503             # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504             # Else we're in pod mode:
505              
506             # Apply any necessary transcoding:
507 17376 100       32515 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508              
509             # HERE WE CATCH =encoding EARLY!
510 17376 100       32270 if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511 39 100       189 next if $self->parse_characters; # Ignore this line
512 38         142 $line = $self->_handle_encoding_line( $line );
513             }
514              
515 17375 100       61699 if($line =~ m/^=cut/s) {
    100          
    100          
516             # here ends the pod block, and therefore the previous pod para
517 166         282 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518 166         384 $self->{'in_pod'} = 0;
519             # ++$self->{'pod_para_count'};
520 166         489 $self->_ponder_paragraph_buffer();
521             # by now it's safe to consider the previous paragraph as done.
522 166         244 DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523 166 100       566 $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524             if $cut_handler;
525              
526             # TODO: add to docs: Note: this may cause cuts to be processed out
527             # of order relative to pods, but in order relative to code.
528              
529             } elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530 5731 100 66     21489 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531 22 100       82 $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532             if $wl_handler;
533             }
534              
535 5731 100 66     26387 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      100        
536 632         941 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537 632         819 push @{$paras->[-1]}, $line;
  632         1482  
538             } # otherwise it's not interesting
539              
540 5731 100 100     18434 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541 5636         7208 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542             }
543              
544 5731         10028 $self->{'last_was_blank'} = 1;
545              
546             } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547              
548 6341 100       19133 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(\s+|$)(.*)/s) {
    100          
549             # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550 2855         11865 my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551 2855 100 100     11617 $new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552             # Note that in "=head1 foo", the WS is lost.
553             # Example: ['=head1', {'start_line' => 123}, ' foo']
554              
555 2855         4530 ++$self->{'pod_para_count'};
556              
557 2855         8597 $self->_ponder_paragraph_buffer();
558             # by now it's safe to consider the previous paragraph as done.
559              
560 2855         5375 push @$paras, $new; # the new incipient paragraph
561 2855         4335 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562              
563             } elsif($line =~ m/^\s/s) {
564              
565 717 100 33     3904 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      66        
566 173         244 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567 173         254 push @{$paras->[-1]}, $line;
  173         476  
568             } else {
569 544         855 ++$self->{'pod_para_count'};
570 544         1437 $self->_ponder_paragraph_buffer();
571             # by now it's safe to consider the previous paragraph as done.
572 544         757 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573 544         2110 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574             }
575             } else {
576 2769         4226 ++$self->{'pod_para_count'};
577 2769         7229 $self->_ponder_paragraph_buffer();
578             # by now it's safe to consider the previous paragraph as done.
579 2769         9864 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580 2769         4415 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581             }
582 6341         13755 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583              
584             } else {
585             # It's a non-blank line /continuing/ the current para
586 5137 50       9434 if(@$paras) {
587 5137         6209 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588 5137         6284 push @{$paras->[-1]}, $line;
  5137         11370  
589             } else {
590             # Unexpected case!
591 0         0 die "Continuing a paragraph but \@\$paras is empty?";
592             }
593 5137         9883 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594             }
595              
596             } # ends the big while loop
597              
598 9477         12486 DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599 9477         46774 return $self;
600             }
601              
602             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603              
604             sub _handle_encoding_line {
605 59     59   194 my($self, $line) = @_;
606              
607 59 50       183 return if $self->parse_characters;
608              
609             # The point of this routine is to set $self->{'_transcoder'} as indicated.
610              
611 59 50       330 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
612 59         104 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
613              
614 59         170 my $e = $1;
615 59         104 my $orig = $e;
616 59         115 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
  59         267  
617              
618 59         106 my $enc_error;
619              
620             # Cf. perldoc Encode and perldoc Encode::Supported
621              
622 59         7537 require Pod::Simple::Transcode;
623              
624 59 100 33     226 if( $self->{'encoding'} ) {
    50          
    100          
625 5         24 my $norm_current = $self->{'encoding'};
626 5         12 my $norm_e = $e;
627 5         15 foreach my $that ($norm_current, $norm_e) {
628 10         23 $that = lc($that);
629 10         49 $that =~ s/[-_]//g;
630             }
631 5 100       21 if($norm_current eq $norm_e) {
632 3         4 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
633             "redundant. ($norm_current eq $norm_e). Ignoring.\n";
634 3         9 $enc_error = '';
635             # But that doesn't necessarily mean that the earlier one went okay
636             } else {
637 2         27 $enc_error = "Encoding is already set to " . $self->{'encoding'};
638 2         4 DEBUG > 1 and print STDERR $enc_error;
639             }
640             } elsif (
641             # OK, let's turn on the encoding
642             do {
643 54         92 DEBUG > 1 and print STDERR " Setting encoding to $e\n";
644 54         147 $self->{'encoding'} = $e;
645 54         631 1;
646             }
647             and $e eq 'HACKRAW'
648             ) {
649 0         0 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
650              
651             } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
652              
653             die($enc_error = "WHAT? _transcoder is already set?!")
654 52 50       42757 if $self->{'_transcoder'}; # should never happen
655 52         248 require Pod::Simple::Transcode;
656 52         279 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
657 52         120 eval {
658 52         160 my @x = ('', "abc", "123");
659 52         239 $self->{'_transcoder'}->(@x);
660             };
661 52 50       185 $@ && die( $enc_error =
662             "Really unexpected error setting up encoding $e: $@\nAborting"
663             );
664 52         143 $self->{'detected_encoding'} = $e;
665              
666             } else {
667 2         1875 my @supported = Pod::Simple::Transcode::->all_encodings;
668              
669             # Note unsupported, and complain
670 2         2104 DEBUG and print STDERR " Encoding [$e] is unsupported.",
671             "\nSupporteds: @supported\n";
672 2         13 my $suggestion = '';
673              
674             # Look for a near match:
675 2         10 my $norm = lc($e);
676 2         7 $norm =~ tr[-_][]d;
677 2         5 my $n;
678 2         7 foreach my $enc (@supported) {
679 248         323 $n = lc($enc);
680 248         310 $n =~ tr[-_][]d;
681 248 50       419 next unless $n eq $norm;
682 0         0 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
683 0         0 last;
684             }
685 2         25 my $encmodver = Pod::Simple::Transcode::->encmodver;
686 2         37 $enc_error = join '' =>
687             "This document probably does not appear as it should, because its ",
688             "\"=encoding $e\" line calls for an unsupported encoding.",
689             $suggestion, " [$encmodver\'s supported encodings are: @supported]"
690             ;
691              
692 2         28 $self->scream( $self->{'line_count'}, $enc_error );
693             }
694 59         115 push @{ $self->{'encoding_command_statuses'} }, $enc_error;
  59         181  
695 59 100       191 if (defined($self->{'_processed_encoding'})) {
696             # Double declaration.
697 1         8 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
698             }
699 59         122 $self->{'_processed_encoding'} = $orig;
700              
701 59         127 return $line;
702             }
703              
704             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
705              
706             sub _handle_encoding_second_level {
707             # By time this is called, the encoding (if well formed) will already
708             # have been acted on.
709 38     38   123 my($self, $para) = @_;
710 38         117 my @x = @$para;
711 38         138 my $content = join ' ', splice @x, 2;
712 38         112 $content =~ s/^\s+//s;
713 38         107 $content =~ s/\s+$//s;
714              
715 38         66 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
716              
717 38 100       103 if (defined($self->{'_processed_encoding'})) {
718             #if($content ne $self->{'_processed_encoding'}) {
719             # Could it happen?
720             #}
721 37         98 delete $self->{'_processed_encoding'};
722             # It's already been handled. Check for errors.
723 37 50       184 if(! $self->{'encoding_command_statuses'} ) {
    100          
724 0         0 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
725             } elsif( $self->{'encoding_command_statuses'}[-1] ) {
726             $self->whine( $para->[1]{'start_line'},
727             sprintf "Couldn't do %s: %s",
728             $self->{'encoding_command_reqs' }[-1],
729 4         73 $self->{'encoding_command_statuses'}[-1],
730             );
731             } else {
732 33         55 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
733             }
734              
735             } else {
736             # Otherwise it's a syntax error
737 1         20 $self->whine( $para->[1]{'start_line'},
738             "Invalid =encoding syntax: $content"
739             );
740             }
741              
742 38         83 return;
743             }
744              
745             #~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
746              
747             {
748             my $m = -321; # magic line number
749              
750             sub _gen_errata {
751 890     890   1416 my $self = $_[0];
752             # Return 0 or more fake-o paragraphs explaining the accumulated
753             # errors on this document.
754              
755 890 100 66     3006 return() unless $self->{'errata'} and keys %{$self->{'errata'}};
  40         235  
756              
757 40         95 my @out;
758              
759 40         79 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
  26         74  
  40         204  
760             push @out,
761             ['=item', {'start_line' => $m}, "Around line $line:"],
762             map( ['~Para', {'start_line' => $m, '~cooked' => 1},
763             #['~Top', {'start_line' => $m},
764             $_
765             #]
766             ],
767 59         291 @{$self->{'errata'}{$line}}
  59         372  
768             )
769             ;
770             }
771              
772             # TODO: report of unknown entities? unrenderable characters?
773              
774 40         469 unshift @out,
775             ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
776             ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
777             "Hey! ",
778             ['B', {},
779             'The above document had some coding errors, which are explained below:'
780             ]
781             ],
782             ['=over', {'start_line' => $m, 'errata' => 1}, ''],
783             ;
784              
785 40         189 push @out,
786             ['=back', {'start_line' => $m, 'errata' => 1}, ''],
787             ;
788              
789 40         86 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
790              
791 40         159 return @out;
792             }
793              
794             }
795              
796             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797              
798             ##############################################################################
799             ##
800             ## stop reading now stop reading now stop reading now stop reading now stop
801             ##
802             ## HERE IT BECOMES REALLY SCARY
803             ##
804             ## stop reading now stop reading now stop reading now stop reading now stop
805             ##
806             ##############################################################################
807              
808             sub _ponder_paragraph_buffer {
809              
810             # Para-token types as found in the buffer.
811             # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
812             # =over, =back, =item
813             # and the null =pod (to be complained about if over one line)
814             #
815             # "~data" paragraphs are something we generate at this level, depending on
816             # a currently open =over region
817              
818             # Events fired: Begin and end for:
819             # directivename (like head1 .. head4), item, extend,
820             # for (from =begin...=end, =for),
821             # over-bullet, over-number, over-text, over-block,
822             # item-bullet, item-number, item-text,
823             # Document,
824             # Data, Para, Verbatim
825             # B, C, longdirname (TODO -- wha?), etc. for all directives
826             #
827              
828 7224     7224   10817 my $self = $_[0];
829 7224         9573 my $paras;
830 7224 100       9119 return unless @{$paras = $self->{'paras'}};
  7224         16797  
831 6290   100     16328 my $curr_open = ($self->{'curr_open'} ||= []);
832              
833 6290         8542 my $scratch;
834              
835 6290         7845 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
836              
837             # We have something in our buffer. So apparently the document has started.
838 6290 100       11846 unless($self->{'doc_has_started'}) {
839 904         2302 $self->{'doc_has_started'} = 1;
840              
841 904         1270 my $starting_contentless;
842 904   66     5347 $starting_contentless =
843             (
844             !@$curr_open
845             and @$paras and ! grep $_->[0] ne '~end', @$paras
846             # i.e., if the paras is all ~ends
847             )
848             ;
849 904         1314 DEBUG and print STDERR "# Starting ",
850             $starting_contentless ? 'contentless' : 'contentful',
851             " document\n"
852             ;
853              
854             $self->_handle_element_start(
855             ($scratch = 'Document'),
856             {
857 904 100       4710 'start_line' => $paras->[0][1]{'start_line'},
858             $starting_contentless ? ( 'contentless' => 1 ) : (),
859             },
860             );
861             }
862              
863 6290         10917 my($para, $para_type);
864 6290         12276 while(@$paras) {
865              
866             # If a directive, assume it's legal; subtract below if found not to be
867 8981 100       25179 $seen_legal_directive++ if $paras->[0][0] =~ /^=/;
868              
869             last if @$paras == 1
870             and ( $paras->[0][0] eq '=over'
871             or $paras->[0][0] eq '=item'
872 8981 100 100     37861 or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
      100        
873             # Those're the three kinds of paragraphs that require lookahead.
874             # Actually, an "=item Foo" inside an region
875             # and any =item inside an region (rare)
876             # don't require any lookahead, but all others (bullets
877             # and numbers) do.
878             # The verbatim is different from the other two, because those might be
879             # like:
880             #
881             # =item
882             # ...
883             # =cut
884             # ...
885             # =item
886             #
887             # The =cut here finishes the paragraph but doesn't terminate the =over
888             # they should be in. (khw apologizes that he didn't comment at the time
889             # why the 'in_pod' works, and no longer remembers why, and doesn't think
890             # it is currently worth the effort to re-figure it out.)
891              
892             # TODO: whinge about many kinds of directives in non-resolving =for regions?
893             # TODO: many? like what? =head1 etc?
894              
895 7466         14801 $para = shift @$paras;
896 7466         12976 $para_type = $para->[0];
897              
898 7466         9519 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
899             $self->_dump_curr_open(), ")\n";
900              
901 7466 100       23502 if($para_type eq '=for') {
    100          
    100          
    100          
902 50 50       140 next if $self->_ponder_for($para,$curr_open,$paras);
903              
904             } elsif($para_type eq '=begin') {
905 127 50       321 next if $self->_ponder_begin($para,$curr_open,$paras);
906              
907             } elsif($para_type eq '=end') {
908 130 50       335 next if $self->_ponder_end($para,$curr_open,$paras);
909              
910             } elsif($para_type eq '~end') { # The virtual end-document signal
911 940 50       2806 next if $self->_ponder_doc_end($para,$curr_open,$paras);
912             }
913              
914              
915             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
916             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
917 6219 100       15507 if(grep $_->[1]{'~ignore'}, @$curr_open) {
918 42         58 DEBUG > 1 and
919             print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
920 42         83 next;
921             }
922             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
923             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
924              
925 6177 100       15371 if($para_type eq '=pod') {
    100          
    100          
926 578         1669 $self->_ponder_pod($para,$curr_open,$paras);
927              
928             } elsif($para_type eq '=over') {
929 213 100       840 next if $self->_ponder_over($para,$curr_open,$paras);
930              
931             } elsif($para_type eq '=back') {
932 209 100       808 next if $self->_ponder_back($para,$curr_open,$paras);
933              
934             } else {
935              
936             # All non-magical codes!!!
937              
938             # Here we start using $para_type for our own twisted purposes, to
939             # mean how it should get treated, not as what the element name
940             # should be.
941              
942 5177         6722 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
943              
944 5177         6683 my $i;
945              
946             # Enforce some =headN discipline
947 5177 100 66     14525 if($para_type =~ m/^=head\d$/s
      100        
      66        
948             and ! $self->{'accept_heads_anywhere'}
949             and @$curr_open
950             and $curr_open->[-1][0] eq '=over'
951             ) {
952 6         10 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
953             $self->whine(
954 6         32 $para->[1]{'start_line'},
955             "You forgot a '=back' before '$para_type'"
956             );
957 6         31 unshift @$paras, ['=back', {}, ''], $para; # close the =over
958 6         16 next;
959             }
960              
961              
962 5171 100 66     19466 if($para_type eq '=item') {
    100          
    100          
    100          
    100          
    100          
    100          
963              
964 1022         1444 my $over;
965 1022 50 33     2715 unless(@$curr_open and
966 1130         4301 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
967             $self->whine(
968 0         0 $para->[1]{'start_line'},
969             "'=item' outside of any '=over'"
970             );
971             unshift @$paras,
972 0         0 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
973             $para
974             ;
975 0         0 next;
976             }
977              
978              
979 1022         2087 my $over_type = $over->[1]{'~type'};
980              
981 1022 50       3583 if(!$over_type) {
    50          
    100          
    100          
    50          
982             # Shouldn't happen1
983             die "Typeless over in stack, starting at line "
984 0         0 . $over->[1]{'start_line'};
985              
986             } elsif($over_type eq 'block') {
987 0 0       0 unless($curr_open->[-1][1]{'~bitched_about'}) {
988 0         0 $curr_open->[-1][1]{'~bitched_about'} = 1;
989             $self->whine(
990             $curr_open->[-1][1]{'start_line'},
991             "You can't have =items (as at line "
992 0         0 . $para->[1]{'start_line'}
993             . ") unless the first thing after the =over is an =item"
994             );
995             }
996             # Just turn it into a paragraph and reconsider it
997 0         0 $para->[0] = '~Para';
998 0         0 unshift @$paras, $para;
999 0         0 next;
1000              
1001             } elsif($over_type eq 'text') {
1002 835         2428 my $item_type = $self->_get_item_type($para);
1003             # That kills the content of the item if it's a number or bullet.
1004 835         1307 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1005              
1006 835 100 66     1878 if($item_type eq 'text') {
    50          
1007             # Nothing special needs doing for 'text'
1008             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1009             $self->whine(
1010 4         41 $para->[1]{'start_line'},
1011             "Expected text after =item, not a $item_type"
1012             );
1013             # Undo our clobbering:
1014 4         14 push @$para, $para->[1]{'~orig_content'};
1015 4         8 delete $para->[1]{'number'};
1016             # Only a PROPER item-number element is allowed
1017             # to have a number attribute.
1018             } else {
1019 0         0 die "Unhandled item type $item_type"; # should never happen
1020             }
1021              
1022             # =item-text thingies don't need any assimilation, it seems.
1023              
1024             } elsif($over_type eq 'number') {
1025 28         109 my $item_type = $self->_get_item_type($para);
1026             # That kills the content of the item if it's a number or bullet.
1027 28         46 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1028              
1029 28         74 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1030              
1031 28 50       179 if($item_type eq 'bullet') {
    50          
    50          
    50          
1032             # Hm, it's not numeric. Correct for this.
1033 0         0 $para->[1]{'number'} = $expected_value;
1034             $self->whine(
1035 0         0 $para->[1]{'start_line'},
1036             "Expected '=item $expected_value'"
1037             );
1038 0         0 push @$para, $para->[1]{'~orig_content'};
1039             # restore the bullet, blocking the assimilation of next para
1040              
1041             } elsif($item_type eq 'text') {
1042             # Hm, it's not numeric. Correct for this.
1043 0         0 $para->[1]{'number'} = $expected_value;
1044             $self->whine(
1045 0         0 $para->[1]{'start_line'},
1046             "Expected '=item $expected_value'"
1047             );
1048             # Text content will still be there and will block next ~Para
1049              
1050             } elsif($item_type ne 'number') {
1051 0         0 die "Unknown item type $item_type"; # should never happen
1052              
1053             } elsif($expected_value == $para->[1]{'number'}) {
1054 28         46 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1055              
1056             } else {
1057 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1058             " instead of the expected value of $expected_value\n";
1059             $self->whine(
1060             $para->[1]{'start_line'},
1061 0         0 "You have '=item " . $para->[1]{'number'} .
1062             "' instead of the expected '=item $expected_value'"
1063             );
1064 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1065             }
1066              
1067 28 50       86 if(@$para == 2) {
1068             # For the cases where we /didn't/ push to @$para
1069 28 100       64 if($paras->[0][0] eq '~Para') {
1070 25         37 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1071 25         39 push @$para, splice @{shift @$paras},2;
  25         70  
1072             } else {
1073 3         6 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1074 3         5 push @$para, ''; # Just so it's not contentless
1075             }
1076             }
1077              
1078              
1079             } elsif($over_type eq 'bullet') {
1080 159         475 my $item_type = $self->_get_item_type($para);
1081             # That kills the content of the item if it's a number or bullet.
1082 159         263 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1083              
1084 159 50       350 if($item_type eq 'bullet') {
    0          
    0          
1085             # as expected!
1086              
1087 159 100       367 if( $para->[1]{'~_freaky_para_hack'} ) {
1088 101         131 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1089 101         197 push @$para, $para->[1]{'~_freaky_para_hack'};
1090             }
1091              
1092             } elsif($item_type eq 'number') {
1093             $self->whine(
1094 0         0 $para->[1]{'start_line'},
1095             "Expected '=item *'"
1096             );
1097 0         0 push @$para, $para->[1]{'~orig_content'};
1098             # and block assimilation of the next paragraph
1099 0         0 delete $para->[1]{'number'};
1100             # Only a PROPER item-number element is allowed
1101             # to have a number attribute.
1102             } elsif($item_type eq 'text') {
1103             $self->whine(
1104 0         0 $para->[1]{'start_line'},
1105             "Expected '=item *'"
1106             );
1107             # But doesn't need processing. But it'll block assimilation
1108             # of the next para.
1109             } else {
1110 0         0 die "Unhandled item type $item_type"; # should never happen
1111             }
1112              
1113 159 100       398 if(@$para == 2) {
1114             # For the cases where we /didn't/ push to @$para
1115 58 50       152 if($paras->[0][0] eq '~Para') {
1116 58         88 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1117 58         92 push @$para, splice @{shift @$paras},2;
  58         160  
1118             } else {
1119 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1120 0         0 push @$para, ''; # Just so it's not contentless
1121             }
1122             }
1123              
1124             } else {
1125 0         0 die "Unhandled =over type \"$over_type\"?";
1126             # Shouldn't happen!
1127             }
1128              
1129 1022         1701 $para_type = 'Plain';
1130 1022         2771 $para->[0] .= '-' . $over_type;
1131             # Whew. Now fall thru and process it.
1132              
1133              
1134             } elsif($para_type eq '=extend') {
1135             # Well, might as well implement it here.
1136 21         76 $self->_ponder_extend($para);
1137 21         50 next; # and skip
1138             } elsif($para_type eq '=encoding') {
1139             # Not actually acted on here, but we catch errors here.
1140 38         205 $self->_handle_encoding_second_level($para);
1141 38 100       130 next unless $self->keep_encoding_directive;
1142 35         77 $para_type = 'Plain';
1143             } elsif($para_type eq '~Verbatim') {
1144 538         972 $para->[0] = 'Verbatim';
1145 538         810 $para_type = '?Verbatim';
1146             } elsif($para_type eq '~Para') {
1147 2772         4592 $para->[0] = 'Para';
1148 2772         4243 $para_type = '?Plain';
1149             } elsif($para_type eq 'Data') {
1150 30         56 $para->[0] = 'Data';
1151 30         45 $para_type = '?Data';
1152             } elsif( $para_type =~ s/^=//s
1153             and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1154             ) {
1155 738         1323 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1156             } else {
1157             # An unknown directive!
1158 12         37 $seen_legal_directive--;
1159             DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1160 12         22 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1161             ;
1162             $self->whine(
1163 12         129 $para->[1]{'start_line'},
1164             "Unknown directive: $para->[0]"
1165             );
1166              
1167             # And maybe treat it as text instead of just letting it go?
1168 12         51 next;
1169             }
1170              
1171 5135 100       17627 if($para_type =~ s/^\?//s) {
1172 3340 100       6775 if(! @$curr_open) { # usual case
1173 2066         2841 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1174             } else {
1175 1274         3373 my @fors = grep $_->[0] eq '=for', @$curr_open;
1176             DEBUG > 1 and print STDERR "Containing fors: ",
1177 1274         1783 join(',', map $_->[1]{'target'}, @fors), "\n";
1178              
1179 1274 100       2590 if(! @fors) {
    100          
1180 1150         1718 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1181              
1182             #} elsif(grep $_->[1]{'~resolve'}, @fors) {
1183             #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1184             } elsif( $fors[-1][1]{'~resolve'} ) {
1185             # Look to the immediately containing for
1186              
1187 73 100       154 if($para_type eq 'Data') {
1188 18         24 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1189 18         29 $para->[0] = 'Para';
1190 18         40 $para_type = 'Plain';
1191             } else {
1192 55         85 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1193             }
1194             } else {
1195 51         88 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1196 51         136 $para->[0] = $para_type = 'Data';
1197             }
1198             }
1199             }
1200              
1201             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1202 5135 100       10340 if($para_type eq 'Plain') {
    100          
    50          
1203 4547         10516 $self->_ponder_Plain($para);
1204             } elsif($para_type eq 'Verbatim') {
1205 534         1429 $self->_ponder_Verbatim($para);
1206             } elsif($para_type eq 'Data') {
1207 54         174 $self->_ponder_Data($para);
1208             } else {
1209 0         0 die "\$para type is $para_type -- how did that happen?";
1210             # Shouldn't happen.
1211             }
1212              
1213             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1214 5135         13931 $para->[0] =~ s/^[~=]//s;
1215              
1216 5135         7329 DEBUG and print STDERR "\n", pretty($para), "\n";
1217              
1218             # traverse the treelet (which might well be just one string scalar)
1219             $self->{'content_seen'} ||= 1 if $seen_legal_directive
1220 5135 100 100     22584 && ! $self->{'~tried_gen_errata'};
      100        
1221 5135         12610 $self->_traverse_treelet_bit(@$para);
1222             }
1223             }
1224              
1225 6290         17108 return;
1226             }
1227              
1228             ###########################################################################
1229             # The sub-ponderers...
1230              
1231              
1232              
1233             sub _ponder_for {
1234 50     50   124 my ($self,$para,$curr_open,$paras) = @_;
1235              
1236             # Fake it out as a begin/end
1237 50         70 my $target;
1238              
1239 50 50       128 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1240 0         0 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1241 0         0 return 1;
1242             }
1243              
1244 50         151 for(my $i = 2; $i < @$para; ++$i) {
1245 50 50       234 if($para->[$i] =~ s/^\s*(\S+)\s*//s) {
1246 50         107 $target = $1;
1247 50         83 last;
1248             }
1249             }
1250 50 50       115 unless(defined $target) {
1251             $self->whine(
1252 0         0 $para->[1]{'start_line'},
1253             "=for without a target?"
1254             );
1255 0         0 return 1;
1256             }
1257 50         70 DEBUG > 1 and
1258             print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1259              
1260 50         115 $para->[0] = 'Data';
1261              
1262             unshift @$paras,
1263             ['=begin',
1264             {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1265             $target,
1266             ],
1267             $para,
1268             ['=end',
1269 50         320 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1270             $target,
1271             ],
1272             ;
1273              
1274 50         185 return 1;
1275             }
1276              
1277             sub _ponder_begin {
1278 127     127   253 my ($self,$para,$curr_open,$paras) = @_;
1279 127         333 my $content = join ' ', splice @$para, 2;
1280 127         310 $content =~ s/^\s+//s;
1281 127         283 $content =~ s/\s+$//s;
1282 127 50       279 unless(length($content)) {
1283             $self->whine(
1284 0         0 $para->[1]{'start_line'},
1285             "=begin without a target?"
1286             );
1287 0         0 DEBUG and print STDERR "Ignoring targetless =begin\n";
1288 0         0 return 1;
1289             }
1290              
1291 127         562 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/;
1292 127 100       330 $para->[1]{'title'} = $title if ($title);
1293 127         330 $para->[1]{'target'} = $target; # without any ':'
1294 127         183 $content = $target; # strip off the title
1295              
1296 127         208 $content =~ s/^:!/!:/s;
1297 127         173 my $neg; # whether this is a negation-match
1298 127 100       395 $neg = 1 if $content =~ s/^!//s;
1299 127         229 my $to_resolve; # whether to process formatting codes
1300 127 100       322 $to_resolve = 1 if $content =~ s/^://s;
1301              
1302 127         189 my $dont_ignore; # whether this target matches us
1303              
1304 127 100       447 foreach my $target_name (
1305             split(',', $content, -1),
1306             $neg ? () : '*'
1307             ) {
1308 249         316 DEBUG > 2 and
1309             print STDERR " Considering whether =begin $content matches $target_name\n";
1310 249 100       570 next unless $self->{'accept_targets'}{$target_name};
1311              
1312 66         91 DEBUG > 2 and
1313             print STDERR " It DOES match the acceptable target $target_name!\n";
1314             $to_resolve = 1
1315 66 100       179 if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1316 66         105 $dont_ignore = 1;
1317 66         155 $para->[1]{'target_matching'} = $target_name;
1318 66         103 last; # stop looking at other target names
1319             }
1320              
1321 127 100       304 if($neg) {
1322 33 100       67 if( $dont_ignore ) {
1323 6         12 $dont_ignore = '';
1324 6         11 delete $para->[1]{'target_matching'};
1325 6         9 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1326             } else {
1327 27         44 $dont_ignore = 1;
1328 27         55 $para->[1]{'target_matching'} = '!';
1329 27         41 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1330             }
1331             }
1332              
1333 127         198 $para->[0] = '=for'; # Just what we happen to call these, internally
1334 127   100     448 $para->[1]{'~really'} ||= '=begin';
1335 127   100     397 $para->[1]{'~ignore'} = (! $dont_ignore) || 0;
1336 127   100     326 $para->[1]{'~resolve'} = $to_resolve || 0;
1337              
1338 127         152 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1339             "ignore contents of this region\n";
1340 127         170 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1341             ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1342 127         176 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1343              
1344 127         244 push @$curr_open, $para;
1345 127 100 100     505 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1346 40         54 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1347             } else {
1348 87 50 100     281 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1349 87         291 $self->_handle_element_start((my $scratch='for'), $para->[1]);
1350             }
1351              
1352 127         449 return 1;
1353             }
1354              
1355             sub _ponder_end {
1356 130     130   293 my ($self,$para,$curr_open,$paras) = @_;
1357 130         373 my $content = join ' ', splice @$para, 2;
1358 130         331 $content =~ s/^\s+//s;
1359 130         292 $content =~ s/\s+$//s;
1360 130         182 DEBUG and print STDERR "Ogling '=end $content' directive\n";
1361              
1362 130 50       281 unless(length($content)) {
1363             $self->whine(
1364             $para->[1]{'start_line'},
1365             "'=end' without a target?" . (
1366             ( @$curr_open and $curr_open->[-1][0] eq '=for' )
1367 0 0 0     0 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1368             : ''
1369             )
1370             );
1371 0         0 DEBUG and print STDERR "Ignoring targetless =end\n";
1372 0         0 return 1;
1373             }
1374              
1375 130 50       404 unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1376             $self->whine(
1377 0         0 $para->[1]{'start_line'},
1378             "'=end $content' is invalid. (Stack: "
1379             . $self->_dump_curr_open() . ')'
1380             );
1381 0         0 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1382 0         0 return 1;
1383             }
1384              
1385 130 50 33     496 unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1386             $self->whine(
1387 0         0 $para->[1]{'start_line'},
1388             "=end $content without matching =begin. (Stack: "
1389             . $self->_dump_curr_open() . ')'
1390             );
1391 0         0 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1392 0         0 return 1;
1393             }
1394              
1395 130 100       324 unless($content eq $curr_open->[-1][1]{'target'}) {
1396             $self->whine(
1397             $para->[1]{'start_line'},
1398             "=end $content doesn't match =begin "
1399 3         20 . $curr_open->[-1][1]{'target'}
1400             . ". (Stack: "
1401             . $self->_dump_curr_open() . ')'
1402             );
1403 3         4 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1404 3         13 return 1;
1405             }
1406              
1407             # Else it's okay to close...
1408 127 100       376 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1409 40         52 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1410             # And that may be because of this to-be-closed =for region, or some
1411             # other one, but it doesn't matter.
1412             } else {
1413 87         165 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1414             # what's that for?
1415              
1416 87 50 50     266 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1417 87         274 $self->_handle_element_end( my $scratch = 'for', $para->[1]);
1418             }
1419 127         208 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1420 127         203 pop @$curr_open;
1421              
1422 127         602 return 1;
1423             }
1424              
1425             sub _ponder_doc_end {
1426 940     940   2277 my ($self,$para,$curr_open,$paras) = @_;
1427 940 100       1966 if(@$curr_open) { # Deal with things left open
1428 10         16 DEBUG and print STDERR "Stack is nonempty at end-document: (",
1429             $self->_dump_curr_open(), ")\n";
1430              
1431 10         19 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1432 10         56 unshift @$paras, $self->_closers_for_all_curr_open;
1433             # Make sure there is exactly one ~end in the parastack, at the end:
1434 10         56 @$paras = grep $_->[0] ne '~end', @$paras;
1435 10         27 push @$paras, $para, $para;
1436             # We need two -- once for the next cycle where we
1437             # generate errata, and then another to be at the end
1438             # when that loop back around to process the errata.
1439 10         44 return 1;
1440              
1441             } else {
1442 930         1318 DEBUG and print STDERR "Okay, stack is empty now.\n";
1443             }
1444              
1445             # Try generating errata section, if applicable
1446 930 100       2177 unless($self->{'~tried_gen_errata'}) {
1447 890         1728 $self->{'~tried_gen_errata'} = 1;
1448 890         2292 my @extras = $self->_gen_errata();
1449 890 100       2119 if(@extras) {
1450 40         118 unshift @$paras, @extras;
1451 40         82 DEBUG and print STDERR "Generated errata... relooping...\n";
1452 40         199 return 1; # I.e., loop around again to process these fake-o paragraphs
1453             }
1454             }
1455              
1456 890         1716 splice @$paras; # Well, that's that for this paragraph buffer.
1457 890         1258 DEBUG and print STDERR "Throwing end-document event.\n";
1458              
1459 890         2790 $self->_handle_element_end( my $scratch = 'Document' );
1460 890         3175 return 1; # Hasta la byebye
1461             }
1462              
1463             sub _ponder_pod {
1464 578     578   1287 my ($self,$para,$curr_open,$paras) = @_;
1465             $self->whine(
1466 578 50       1309 $para->[1]{'start_line'},
1467             "=pod directives shouldn't be over one line long! Ignoring all "
1468             . (@$para - 2) . " lines of content"
1469             ) if @$para > 3;
1470              
1471             # Content ignored unless 'pod_handler' is set
1472 578 100       1304 if (my $pod_handler = $self->{'pod_handler'}) {
1473 6         22 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1474 6 100       18 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1475 6         15 $pod_handler->($line, $line_num, $self);
1476             }
1477              
1478             # The surrounding methods set content_seen, so let us remain consistent.
1479             # I do not know why it was not here before -- should it not be here?
1480             # $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1481              
1482 578         1614 return;
1483             }
1484              
1485             sub _ponder_over {
1486 213     213   569 my ($self,$para,$curr_open,$paras) = @_;
1487 213 50       497 return 1 unless @$paras;
1488 213         345 my $list_type;
1489              
1490 213 100       666 if($paras->[0][0] eq '=item') { # most common case
    100          
    50          
1491 185         717 $list_type = $self->_get_initial_item_type($paras->[0]);
1492              
1493             } elsif($paras->[0][0] eq '=back') {
1494             # Ignore empty lists by default
1495 6 100       19 if ($self->{'parse_empty_lists'}) {
1496 2         4 $list_type = 'empty';
1497             } else {
1498 4         8 shift @$paras;
1499 4         19 return 1;
1500             }
1501             } elsif($paras->[0][0] eq '~end') {
1502             $self->whine(
1503 0         0 $para->[1]{'start_line'},
1504             "=over is the last thing in the document?!"
1505             );
1506 0         0 return 1; # But feh, ignore it.
1507             } else {
1508 22         36 $list_type = 'block';
1509             }
1510 209         484 $para->[1]{'~type'} = $list_type;
1511 209         473 push @$curr_open, $para;
1512             # yes, we reuse the paragraph as a stack item
1513              
1514 209         719 my $content = join ' ', splice @$para, 2;
1515 209         527 $para->[1]{'~orig_content'} = $content;
1516 209         307 my $overness;
1517 209 100       1081 if($content =~ m/^\s*$/s) {
    50          
1518 119         352 $para->[1]{'indent'} = 4;
1519             } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
1520 68     68   496223 no integer;
  68         216  
  68         472  
1521 90         310 $para->[1]{'indent'} = $1;
1522 90 50       279 if($1 == 0) {
1523             $self->whine(
1524 0         0 $para->[1]{'start_line'},
1525             "Can't have a 0 in =over $content"
1526             );
1527 0         0 $para->[1]{'indent'} = 4;
1528             }
1529             } else {
1530             $self->whine(
1531 0         0 $para->[1]{'start_line'},
1532             "=over should be: '=over' or '=over positive_number'"
1533             );
1534 0         0 $para->[1]{'indent'} = 4;
1535             }
1536 209         336 DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1537              
1538 209 100 100     812 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1539 209         944 $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1540              
1541 209         776 return;
1542             }
1543              
1544             sub _ponder_back {
1545 209     209   557 my ($self,$para,$curr_open,$paras) = @_;
1546             # TODO: fire off or or ??
1547              
1548 209         582 my $content = join ' ', splice @$para, 2;
1549 209 50       561 if($content =~ m/\S/) {
1550             $self->whine(
1551 0         0 $para->[1]{'start_line'},
1552             "=back doesn't take any parameters, but you said =back $content"
1553             );
1554             }
1555              
1556 209 50 33     939 if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1557 209         322 DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1558             # Expected case: we're closing the most recently opened thing
1559             #my $over = pop @$curr_open;
1560 209 100 50     702 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1561             $self->_handle_element_end( my $scratch =
1562 209         933 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1563             );
1564             } else {
1565 0         0 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1566             join(', ', map $_->[0], @$curr_open), ").\n";
1567             $self->whine(
1568 0         0 $para->[1]{'start_line'},
1569             '=back without =over'
1570             );
1571 0         0 return 1; # and ignore it
1572             }
1573             }
1574              
1575             sub _ponder_item {
1576 0     0   0 my ($self,$para,$curr_open,$paras) = @_;
1577 0         0 my $over;
1578 0 0 0     0 unless(@$curr_open and
1579 0         0 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1580             $self->whine(
1581 0         0 $para->[1]{'start_line'},
1582             "'=item' outside of any '=over'"
1583             );
1584             unshift @$paras,
1585 0         0 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1586             $para
1587             ;
1588 0         0 return 1;
1589             }
1590              
1591              
1592 0         0 my $over_type = $over->[1]{'~type'};
1593              
1594 0 0       0 if(!$over_type) {
    0          
    0          
    0          
    0          
1595             # Shouldn't happen1
1596             die "Typeless over in stack, starting at line "
1597 0         0 . $over->[1]{'start_line'};
1598              
1599             } elsif($over_type eq 'block') {
1600 0 0       0 unless($curr_open->[-1][1]{'~bitched_about'}) {
1601 0         0 $curr_open->[-1][1]{'~bitched_about'} = 1;
1602             $self->whine(
1603             $curr_open->[-1][1]{'start_line'},
1604             "You can't have =items (as at line "
1605 0         0 . $para->[1]{'start_line'}
1606             . ") unless the first thing after the =over is an =item"
1607             );
1608             }
1609             # Just turn it into a paragraph and reconsider it
1610 0         0 $para->[0] = '~Para';
1611 0         0 unshift @$paras, $para;
1612 0         0 return 1;
1613              
1614             } elsif($over_type eq 'text') {
1615 0         0 my $item_type = $self->_get_item_type($para);
1616             # That kills the content of the item if it's a number or bullet.
1617 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1618              
1619 0 0 0     0 if($item_type eq 'text') {
    0          
1620             # Nothing special needs doing for 'text'
1621             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1622             $self->whine(
1623 0         0 $para->[1]{'start_line'},
1624             "Expected text after =item, not a $item_type"
1625             );
1626             # Undo our clobbering:
1627 0         0 push @$para, $para->[1]{'~orig_content'};
1628 0         0 delete $para->[1]{'number'};
1629             # Only a PROPER item-number element is allowed
1630             # to have a number attribute.
1631             } else {
1632 0         0 die "Unhandled item type $item_type"; # should never happen
1633             }
1634              
1635             # =item-text thingies don't need any assimilation, it seems.
1636              
1637             } elsif($over_type eq 'number') {
1638 0         0 my $item_type = $self->_get_item_type($para);
1639             # That kills the content of the item if it's a number or bullet.
1640 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1641              
1642 0         0 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1643              
1644 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
    0          
1645             # Hm, it's not numeric. Correct for this.
1646 0         0 $para->[1]{'number'} = $expected_value;
1647             $self->whine(
1648 0         0 $para->[1]{'start_line'},
1649             "Expected '=item $expected_value'"
1650             );
1651 0         0 push @$para, $para->[1]{'~orig_content'};
1652             # restore the bullet, blocking the assimilation of next para
1653              
1654             } elsif($item_type eq 'text') {
1655             # Hm, it's not numeric. Correct for this.
1656 0         0 $para->[1]{'number'} = $expected_value;
1657             $self->whine(
1658 0         0 $para->[1]{'start_line'},
1659             "Expected '=item $expected_value'"
1660             );
1661             # Text content will still be there and will block next ~Para
1662              
1663             } elsif($item_type ne 'number') {
1664 0         0 die "Unknown item type $item_type"; # should never happen
1665              
1666             } elsif($expected_value == $para->[1]{'number'}) {
1667 0         0 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1668              
1669             } else {
1670 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1671             " instead of the expected value of $expected_value\n";
1672             $self->whine(
1673             $para->[1]{'start_line'},
1674 0         0 "You have '=item " . $para->[1]{'number'} .
1675             "' instead of the expected '=item $expected_value'"
1676             );
1677 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1678             }
1679              
1680 0 0       0 if(@$para == 2) {
1681             # For the cases where we /didn't/ push to @$para
1682 0 0       0 if($paras->[0][0] eq '~Para') {
1683 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1684 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1685             } else {
1686 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1687 0         0 push @$para, ''; # Just so it's not contentless
1688             }
1689             }
1690              
1691              
1692             } elsif($over_type eq 'bullet') {
1693 0         0 my $item_type = $self->_get_item_type($para);
1694             # That kills the content of the item if it's a number or bullet.
1695 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1696              
1697 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
1698             # as expected!
1699              
1700 0 0       0 if( $para->[1]{'~_freaky_para_hack'} ) {
1701 0         0 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1702 0         0 push @$para, $para->[1]{'~_freaky_para_hack'};
1703             }
1704              
1705             } elsif($item_type eq 'number') {
1706             $self->whine(
1707 0         0 $para->[1]{'start_line'},
1708             "Expected '=item *'"
1709             );
1710 0         0 push @$para, $para->[1]{'~orig_content'};
1711             # and block assimilation of the next paragraph
1712 0         0 delete $para->[1]{'number'};
1713             # Only a PROPER item-number element is allowed
1714             # to have a number attribute.
1715             } elsif($item_type eq 'text') {
1716             $self->whine(
1717 0         0 $para->[1]{'start_line'},
1718             "Expected '=item *'"
1719             );
1720             # But doesn't need processing. But it'll block assimilation
1721             # of the next para.
1722             } else {
1723 0         0 die "Unhandled item type $item_type"; # should never happen
1724             }
1725              
1726 0 0       0 if(@$para == 2) {
1727             # For the cases where we /didn't/ push to @$para
1728 0 0       0 if($paras->[0][0] eq '~Para') {
1729 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1730 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1731             } else {
1732 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1733 0         0 push @$para, ''; # Just so it's not contentless
1734             }
1735             }
1736              
1737             } else {
1738 0         0 die "Unhandled =over type \"$over_type\"?";
1739             # Shouldn't happen!
1740             }
1741 0         0 $para->[0] .= '-' . $over_type;
1742              
1743 0         0 return;
1744             }
1745              
1746             sub _ponder_Plain {
1747 4547     4547   8598 my ($self,$para) = @_;
1748 4547         6540 DEBUG and print STDERR " giving plain treatment...\n";
1749 4547 100 100     25638 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
      66        
      100        
1750             or $para->[1]{'~cooked'}
1751             ) {
1752             push @$para,
1753 4442         6911 @{$self->_make_treelet(
1754             join("\n", splice(@$para, 2)),
1755 4442         20743 $para->[1]{'start_line'}
1756             )};
1757             }
1758             # Empty paragraphs don't need a treelet for any reason I can see.
1759             # And precooked paragraphs already have a treelet.
1760 4547         9488 return;
1761             }
1762              
1763             sub _ponder_Verbatim {
1764 534     534   1088 my ($self,$para) = @_;
1765 534         734 DEBUG and print STDERR " giving verbatim treatment...\n";
1766              
1767 534         1148 $para->[1]{'xml:space'} = 'preserve';
1768              
1769 534 100       1232 unless ($self->{'_output_is_for_JustPod'}) {
1770             # Fix illegal settings for expand_verbatim_tabs()
1771             # This is because this module doesn't do input error checking, but khw
1772             # doesn't want to add yet another instance of that.
1773 453         1312 my $tab_width = $self->expand_verbatim_tabs;
1774 453 100 100     2014 $tab_width = $self->expand_verbatim_tabs(8)
1775             if ! defined $tab_width
1776             || $tab_width =~ /\D/;
1777              
1778 453         1136 my $indent = $self->strip_verbatim_indent;
1779 453 100 100     1091 if ($indent && ref $indent eq 'CODE') {
1780 10         14 my @shifted = (shift @{$para}, shift @{$para});
  10         18  
  10         31  
1781 10         32 $indent = $indent->($para);
1782 10         106 unshift @{$para}, @shifted;
  10         30  
1783             }
1784              
1785 453         1144 for(my $i = 2; $i < @$para; $i++) {
1786 2077         3227 foreach my $line ($para->[$i]) { # just for aliasing
1787             # Strip indentation.
1788 2077 100       3491 $line =~ s/^\Q$indent// if $indent;
1789 2077 100       3315 next unless $tab_width;
1790              
1791             # This is commented out because of github issue #85, and the
1792             # current maintainers don't know why it was there in the first
1793             # place.
1794             #&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1795 2076         5882 while( $line =~
1796             # Sort of adapted from Text::Tabs.
1797 189         1253 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1798             * $tab_width)
1799             -(length($1) % $tab_width)))/e
1800             ) {}
1801              
1802             # TODO: whinge about (or otherwise treat) unindented or overlong lines
1803              
1804             }
1805             }
1806             }
1807              
1808             # Now the VerbatimFormatted hoodoo...
1809 534 100 66     2251 if( $self->{'accept_codes'} and
    100          
1810             $self->{'accept_codes'}{'VerbatimFormatted'}
1811             ) {
1812 103   100     443 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
  76         285  
1813             # Kill any number of terminal newlines
1814 103         293 $self->_verbatim_format($para);
1815             } elsif ($self->{'codes_in_verbatim'}) {
1816             push @$para,
1817 2         10 @{$self->_make_treelet(
1818             join("\n", splice(@$para, 2)),
1819 2         22 $para->[1]{'start_line'}, $para->[1]{'xml:space'}
1820             )};
1821 2         8 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1822             } else {
1823 429 100       1939 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1824 429         2691 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1825             }
1826 534         1134 return;
1827             }
1828              
1829             sub _ponder_Data {
1830 54     54   113 my ($self,$para) = @_;
1831 54         76 DEBUG and print STDERR " giving data treatment...\n";
1832 54         107 $para->[1]{'xml:space'} = 'preserve';
1833 54 100       164 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1834 54         102 return;
1835             }
1836              
1837              
1838              
1839              
1840             ###########################################################################
1841              
1842             sub _traverse_treelet_bit { # for use only by the routine above
1843 8010     8010   17045 my($self, $name) = splice @_,0,2;
1844              
1845 8010         10339 my $scratch;
1846 8010         24844 $self->_handle_element_start(($scratch=$name), shift @_);
1847              
1848 8010         17130 while (@_) {
1849 12846         19857 my $x = shift;
1850 12846 100       21384 if (ref($x)) {
1851 2881         5926 &_traverse_treelet_bit($self, @$x);
1852             } else {
1853 9965   100     25237 $x .= shift while @_ && !ref($_[0]);
1854 9965         20973 $self->_handle_text($x);
1855             }
1856             }
1857              
1858 8010         23415 $self->_handle_element_end($scratch=$name);
1859 8010         21314 return;
1860             }
1861              
1862             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1863              
1864             sub _closers_for_all_curr_open {
1865 10     10   23 my $self = $_[0];
1866 10         18 my @closers;
1867 10 50       16 foreach my $still_open (@{ $self->{'curr_open'} || return }) {
  10         73  
1868 15         43 my @copy = @$still_open;
1869 15         26 $copy[1] = {%{ $copy[1] }};
  15         79  
1870             #$copy[1]{'start_line'} = -1;
1871 15 100       63 if($copy[0] eq '=for') {
    50          
1872 9         18 $copy[0] = '=end';
1873             } elsif($copy[0] eq '=over') {
1874             $self->whine(
1875             $still_open->[1]{start_line} ,
1876 6         43 "=over without closing =back"
1877             );
1878              
1879 6         12 $copy[0] = '=back';
1880             } else {
1881 0         0 die "I don't know how to auto-close an open $copy[0] region";
1882             }
1883              
1884 15 50       55 unless( @copy > 2 ) {
1885 15         37 push @copy, $copy[1]{'target'};
1886 15 100       46 $copy[-1] = '' unless defined $copy[-1];
1887             # since =over's don't have targets
1888             }
1889              
1890 15         29 $copy[1]{'fake-closer'} = 1;
1891              
1892 15         18 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1893 15         60 unshift @closers, \@copy;
1894             }
1895 10         37 return @closers;
1896             }
1897              
1898             #--------------------------------------------------------------------------
1899              
1900             sub _verbatim_format {
1901 103     103   212 my($it, $p) = @_;
1902              
1903 103         121 my $formatting;
1904              
1905 103         241 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1906 559         664 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1907 559         1283 $p->[$i] .= "\n";
1908             # Unlike with simple Verbatim blocks, we don't end up just doing
1909             # a join("\n", ...) on the contents, so we have to append a
1910             # newline to every line, and then nix the last one later.
1911             }
1912              
1913 103         153 if( DEBUG > 4 ) {
1914             print STDERR "<<\n";
1915             for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1916             print STDERR "_verbatim_format $i: $p->[$i]";
1917             }
1918             print STDERR ">>\n";
1919             }
1920              
1921 103         301 for(my $i = $#$p; $i > 2; $i--) {
1922             # work backwards over the lines, except the first (#2)
1923              
1924             #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1925             # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1926             # look at a formatty line preceding a nonformatty one
1927 455         532 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1928 455 100       747 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1929 11         14 DEBUG > 5 and print STDERR " It's a formatty line. ",
1930             "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1931              
1932 11 100       24 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1933 1         2 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1934 1         3 next;
1935             } else {
1936 10         13 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1937             }
1938             } else {
1939 444         553 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1940 444         782 next;
1941             }
1942              
1943             # A formatty line has to have #: in the first two columns, and uses
1944             # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1945             # Example:
1946             # What do you want? i like pie. [or whatever]
1947             # #:^^^^^^^^^^^^^^^^^ /////////////
1948              
1949              
1950 10         14 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1951              
1952 10         27 $formatting = ' ' . $1;
1953 10         33 $formatting =~ s/\s+$//s; # nix trailing whitespace
1954 10 50 33     47 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1955 0         0 splice @$p,$i,1; # remove this line
1956 0         0 $i--; # don't consider next line
1957 0         0 next;
1958             }
1959              
1960 10 100       24 if( length($formatting) >= length($p->[$i-1]) ) {
1961 3         8 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1962             } else {
1963 7         25 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1964             }
1965             # Make $formatting and the previous line be exactly the same length,
1966             # with $formatting having a " " as the last character.
1967              
1968 10         14 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1969              
1970              
1971 10         16 my @new_line;
1972 10         44 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
1973             #print STDERR "Format matches $1\n";
1974              
1975 54 100       120 if($2) {
1976             #print STDERR "SKIPPING <$2>\n";
1977 32         139 push @new_line,
1978             substr($p->[$i-1], pos($formatting)-length($1), length($1));
1979             } else {
1980             #print STDERR "SNARING $+\n";
1981 22 50       157 push @new_line, [
    100          
    100          
1982             (
1983             $3 ? 'VerbatimB' :
1984             $4 ? 'VerbatimI' :
1985             $5 ? 'VerbatimBI' : die("Should never get called")
1986             ), {},
1987             substr($p->[$i-1], pos($formatting)-length($1), length($1))
1988             ];
1989             #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1990             }
1991             }
1992 10         40 my @nixed =
1993             splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1994 10         15 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1995              
1996 10         14 DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1997             scalar(@new_line), "):",
1998             map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1999 10         31 $i--; # So the next line we scrutinize is the line before the one
2000             # that we just went and formatted
2001             }
2002              
2003 103         195 $p->[0] = 'VerbatimFormatted';
2004              
2005             # Collapse adjacent text nodes, just for kicks.
2006 103         271 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2007 0 0 0     0 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2008 0         0 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2009 0         0 $p->[$i] .= splice @$p, $i+1, 1; # merge
2010 0         0 --$i; # and back up
2011             }
2012             }
2013              
2014             # Now look for the last text token, and remove the terminal newline
2015 103         286 for( my $i = $#$p; $i >= 2; $i-- ) {
2016             # work backwards over the tokens, even the first
2017 103 50       236 if( !ref($p->[$i]) ) {
2018 103 50       537 if($p->[$i] =~ s/\n$//s) {
2019 103         223 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2020             } else {
2021 0         0 DEBUG > 5 and print STDERR
2022             "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2023             }
2024 103         209 last; # we only want the next one
2025             }
2026             }
2027              
2028 103         202 return;
2029             }
2030              
2031              
2032             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2033              
2034              
2035             sub _treelet_from_formatting_codes {
2036             # Given a paragraph, returns a treelet. Full of scary tokenizing code.
2037             # Like [ '~Top', {'start_line' => $start_line},
2038             # "I like ",
2039             # [ 'B', {}, "pie" ],
2040             # "!"
2041             # ]
2042             # This illustrates the general format of a treelet. It is an array:
2043             # [0] is a scalar indicating its type. In the example above, the
2044             # types are '~Top' and 'B'
2045             # [1] is a hash of various flags about it, possibly empty
2046             # [2] - [N] are an ordered list of the subcomponents of the treelet.
2047             # Scalars are literal text, refs are sub-treelets, to
2048             # arbitrary levels. Stringifying a treelet will recursively
2049             # stringify the sub-treelets, concatentating everything
2050             # together to form the exact text of the treelet.
2051              
2052 4444     4444   9790 my($self, $para, $start_line, $preserve_space) = @_;
2053              
2054 4444         13114 my $treelet = ['~Top', {'start_line' => $start_line},];
2055              
2056 4444 100 100     15852 unless ($preserve_space || $self->{'preserve_whitespace'}) {
2057 3577         28119 $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2058 3577         6766 $para =~ s/ $//;
2059 3577         5257 $para =~ s/^ //;
2060             }
2061              
2062             # Only apparent problem the above code is that N<< >> turns into
2063             # N<< >>. But then, word wrapping does that too! So don't do that!
2064              
2065              
2066             # As a Start-code is encountered, the number of opening bracket '<'
2067             # characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2068             # etc). When closing brackets are found in the text, at least this number
2069             # (plus the 1) will be required to mean the Start-code is terminated. When
2070             # those are found, @stack is popped.
2071 4444         6308 my @stack;
2072              
2073 4444         8098 my @lineage = ($treelet);
2074 4444         6615 my $raw = ''; # raw content of L<> fcode before splitting/processing
2075             # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2076             # into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2077             # the 'collapse and trim all whitespace first' lines just above.
2078 4444         6063 my $inL = 0;
2079              
2080 4444         5748 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2081              
2082             # Here begins our frightening tokenizer RE. The following regex matches
2083             # text in four main parts:
2084             #
2085             # * Start-codes. The first alternative matches C< or C<<, the latter
2086             # followed by some whitespace. $1 will hold the entire start code
2087             # (including any space following a multiple-angle-bracket delimiter),
2088             # and $2 will hold only the additional brackets past the first in a
2089             # multiple-bracket delimiter. length($2) + 1 will be the number of
2090             # closing brackets we have to find.
2091             #
2092             # * Closing brackets. Match some amount of whitespace followed by
2093             # multiple close brackets. The logic to see if this closes anything
2094             # is down below. Note that in order to parse C<< >> correctly, we
2095             # have to use look-behind (?<=\s\s), since the match of the starting
2096             # code will have consumed the whitespace.
2097             #
2098             # * A single closing bracket, to close a simple code like C<>.
2099             #
2100             # * Something that isn't a start or end code. We have to be careful
2101             # about accepting whitespace, since perlpodspec says that any whitespace
2102             # before a multiple-bracket closing delimiter should be ignored.
2103             #
2104 4444         43781 while($para =~
2105             m/\G
2106             (?:
2107             # Match starting codes, including the whitespace following a
2108             # multiple-delimiter start code. $1 gets the whole start code and
2109             # $2 gets all but one of the
2110             ([A-Z]<(?:(<+)\s+)?)
2111             |
2112             # Match multiple-bracket end codes. $3 gets the whitespace that
2113             # should be discarded before an end bracket but kept in other cases
2114             # and $4 gets the end brackets themselves. ($3 can be empty if the
2115             # construct is empty, like C<< >>, and all the white-space has been
2116             # gobbled up already, considered to be space after the opening
2117             # bracket. In this case we use look-behind to verify that there are
2118             # at least 2 spaces in a row before the ">".)
2119             (\s+|(?<=\s\s))(>{2,})
2120             |
2121             (\s?>) # $5: simple end-codes
2122             |
2123             ( # $6: stuff containing no start-codes or end-codes
2124             (?:
2125             [^A-Z\s>]
2126             |
2127             (?:
2128             [A-Z](?!<)
2129             )
2130             |
2131             # whitespace is ok, but we don't want to eat the whitespace before
2132             # a multiple-bracket end code.
2133             # NOTE: we may still have problems with e.g. S<< >>
2134             (?:
2135             \s(?!\s*>{2,})
2136             )
2137             )+
2138             )
2139             )
2140             /xgo
2141             ) {
2142 15883         21863 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2143 15883 100       46732 if(defined $1) {
    100          
    100          
    50          
2144 3078         4179 my $bracket_count; # How many '<<<' in a row this has. Needed for
2145             # Pod::Simple::JustPod
2146 3078 100       5301 if(defined $2) {
2147 115         171 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2148 115         254 $bracket_count = length($2) + 1;
2149 115         212 push @stack, $bracket_count; # length of the necessary complex
2150             # end-code string
2151             } else {
2152 2963         3547 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2153 2963         4307 push @stack, 0; # signal that we're looking for simple
2154 2963         3935 $bracket_count = 1;
2155             }
2156 3078         6282 my $code = substr($1,0,1);
2157 3078 100       5357 if ('L' eq $code) {
2158 433 100       781 if ($inL) {
2159 1         2 $raw .= $1;
2160 1         44 $self->scream( $start_line,
2161             'Nested L<> are illegal. Pretending inner one is '
2162             . 'X<...> so can continue looking for other errors.');
2163 1         2 $code = "X";
2164             }
2165             else {
2166 432         631 $raw = ""; # reset raw content accumulator
2167 432         667 $inL = @stack;
2168             }
2169             } else {
2170 2645 100       4659 $raw .= $1 if $inL;
2171             }
2172 3078         6776 push @lineage, [ $code, {}, ]; # new node object
2173              
2174             # Tell Pod::Simple::JustPod how many brackets there were, but to save
2175             # space, not in the most usual case of there was just 1. It can be
2176             # inferred by the absence of this element. Similarly, if there is more
2177             # than one bracket, extract the white space between the final bracket
2178             # and the real beginning of the interior. Save that if it isn't just a
2179             # single space
2180 3078 100 100     7794 if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2181 18         48 $lineage[-1][1]{'~bracket_count'} = $bracket_count;
2182 18         43 my $lspacer = substr($1, 1 + $bracket_count);
2183 18 100       50 $lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2184             }
2185 3078         3938 push @{ $lineage[-2] }, $lineage[-1];
  3078         16802  
2186             } elsif(defined $4) {
2187 126         187 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2188             # This is where it gets messy...
2189 126 100       564 if(! @stack) {
    100          
    100          
    50          
2190             # We saw " >>>>" but needed nothing. This is ALL just stuff then.
2191 1         4 DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2192 1         8 push @{ $lineage[-1] }, $3, $4;
  1         3  
2193 1         11 next;
2194             } elsif(!$stack[-1]) {
2195             # We saw " >>>>" but needed only ">". Back pos up.
2196 3         5 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2197 3         5 push @{ $lineage[-1] }, $3; # That was a for-real space, too.
  3         7  
2198 3         12 pos($para) = pos($para) - length($4) + 1;
2199             } elsif($stack[-1] == length($4)) {
2200             # We found " >>>>", and it was exactly what we needed. Commonest case.
2201 115         174 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2202             } elsif($stack[-1] < length($4)) {
2203             # We saw " >>>>" but needed only " >>". Back pos up.
2204 0         0 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2205 0         0 pos($para) = pos($para) - length($4) + $stack[-1];
2206             } else {
2207             # We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2208 7         12 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2209 7         14 push @{ $lineage[-1] }, $3, $4;
  7         26  
2210 7         37 next;
2211             }
2212             #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2213              
2214 118 50 66     306 if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2215 3 100       29 if ($3 ne "") {
    100          
2216 1         3 $lineage[-1][1]{'~rspacer'} = $3;
2217             }
2218             elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2219              
2220             # Here we had something like C<< >> which was a false positive
2221 1         3 delete $lineage[-1][1]{'~lspacer'};
2222             }
2223             else {
2224             $lineage[-1][1]{'~rspacer'}
2225 1         15 = substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2226 1         3 chop $lineage[-1][1]{'~lspacer'};
2227             }
2228             }
2229              
2230 118 100       235 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  2         5  
  118         256  
2231             # Keep the element from being childless
2232              
2233 118 100       244 if ($inL == @stack) {
2234 22         54 $lineage[-1][1]{'raw'} = $raw;
2235 22         32 $inL = 0;
2236             }
2237              
2238 118         181 pop @stack;
2239 118         152 pop @lineage;
2240              
2241 118 100       646 $raw .= $3.$4 if $inL;
2242              
2243             } elsif(defined $5) {
2244 3056         3821 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2245              
2246 3056 100 100     9836 if(@stack and ! $stack[-1]) {
2247             # We're indeed expecting a simple end-code
2248 2952         3758 DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2249              
2250 2952 50       5726 if(length($5) == 2) { # There was a space there: " >"
    100          
2251 0         0 push @{ $lineage[-1] }, ' ';
  0         0  
2252 2952         5879 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2253 41         64 push @{ $lineage[-1] }, ''; # keep it from being really childless
  41         109  
2254             }
2255              
2256 2952 100       6001 if ($inL == @stack) {
2257 410         995 $lineage[-1][1]{'raw'} = $raw;
2258 410         554 $inL = 0;
2259             }
2260              
2261 2952         4074 pop @stack;
2262 2952         3983 pop @lineage;
2263             } else {
2264 104         170 DEBUG > 4 and print STDERR " It's just stuff.\n";
2265 104         197 push @{ $lineage[-1] }, $5;
  104         273  
2266             }
2267              
2268 3056 100       21772 $raw .= $5 if $inL;
2269              
2270             } elsif(defined $6) {
2271 9623         11775 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2272 9623         12129 push @{ $lineage[-1] }, $6;
  9623         24960  
2273 9623 100       35449 $raw .= $6 if $inL;
2274             # XXX does not capture multiplace whitespaces -- 'raw' ends up with
2275             # at most 1 leading/trailing whitespace, why not all of it?
2276             # Answer, because we deliberately trimmed it above
2277              
2278             } else {
2279             # should never ever ever ever happen
2280 0         0 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2281 0         0 die "SPORK 512512!";
2282             }
2283             }
2284              
2285 4444 100       8701 if(@stack) { # Uhoh, some sequences weren't closed.
2286 8         38 my $x= "...";
2287 8         20 while(@stack) {
2288 8 50       14 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  0         0  
  8         22  
2289             # Hmmmmm!
2290              
2291 8         20 my $code = (pop @lineage)->[0];
2292 8         42 my $ender_length = pop @stack;
2293 8 50       18 if($ender_length) {
2294 0         0 --$ender_length;
2295 0         0 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2296             } else {
2297 8         29 $x = $code . "<$x>";
2298             }
2299             }
2300 8         12 DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2301 8         39 $self->whine($start_line,
2302             "Unterminated $x sequence",
2303             );
2304             }
2305              
2306 4444         13291 return $treelet;
2307             }
2308              
2309             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2310              
2311             sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2312 0     0 0 0 return stringify_lol($_[1]);
2313             }
2314              
2315             sub stringify_lol { # function: stringify_lol($lol)
2316 2587     2587 0 3865 my $string_form = '';
2317 2587         5470 _stringify_lol( $_[0] => \$string_form );
2318 2587         7461 return $string_form;
2319             }
2320              
2321             sub _stringify_lol { # the real recursor
2322 2967     2967   4646 my($lol, $to) = @_;
2323 2967         5765 for(my $i = 2; $i < @$lol; ++$i) {
2324 3963 100 100     9457 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
      66        
2325 380         667 _stringify_lol( $lol->[$i], $to); # recurse!
2326             } else {
2327 3583         7879 $$to .= $lol->[$i];
2328             }
2329             }
2330 2967         4596 return;
2331             }
2332              
2333             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2334              
2335             sub _dump_curr_open { # return a string representation of the stack
2336 3     3   7 my $curr_open = $_[0]{'curr_open'};
2337              
2338 3 50       6 return '[empty]' unless @$curr_open;
2339             return join '; ',
2340 3         6 map {;
2341             ($_->[0] eq '=for')
2342             ? ( ($_->[1]{'~really'} || '=over')
2343 4 50 50     45 . ' ' . $_->[1]{'target'})
2344             : $_->[0]
2345             }
2346             @$curr_open
2347             ;
2348             }
2349              
2350             ###########################################################################
2351             my %pretty_form = (
2352             "\a" => '\a', # ding!
2353             "\b" => '\b', # BS
2354             "\e" => '\e', # ESC
2355             "\f" => '\f', # FF
2356             "\t" => '\t', # tab
2357             "\cm" => '\cm',
2358             "\cj" => '\cj',
2359             "\n" => '\n', # probably overrides one of either \cm or \cj
2360             '"' => '\"',
2361             '\\' => '\\\\',
2362             '$' => '\\$',
2363             '@' => '\\@',
2364             '%' => '\\%',
2365             '#' => '\\#',
2366             );
2367              
2368             sub pretty { # adopted from Class::Classless
2369             # Not the most brilliant routine, but passable.
2370             # Don't give it a cyclic data structure!
2371 502     502 0 5726 my @stuff = @_; # copy
2372 502         667 my $x;
2373             my $out =
2374             # join ",\n" .
2375             join ", ",
2376 502         778 map {;
2377 592 50 100     4392 if(!defined($_)) {
    100 66        
    50 33        
    100          
    100          
    100          
2378 0         0 "undef";
2379             } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2380 58         146 $x = "[ " . pretty(@$_) . " ]" ;
2381 58         148 $x;
2382             } elsif(ref($_) eq 'SCALAR') {
2383 0         0 $x = "\\" . pretty($$_) ;
2384 0         0 $x;
2385             } elsif(ref($_) eq 'HASH') {
2386 48         86 my $hr = $_;
2387             $x = "{" . join(", ",
2388 48         470 map(pretty($_) . '=>' . pretty($hr->{$_}),
2389             sort keys %$hr ) ) . "}" ;
2390 48         176 $x;
2391 2         5 } elsif(!length($_)) { q{''} # empty string
2392             } elsif(
2393             $_ eq '0' # very common case
2394             or(
2395             m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
2396             and $_ ne '-0' # the strange case that RE lets thru
2397             )
2398 26         82 ) { $_;
2399             } else {
2400             # Yes, explicitly name every character desired. There are shorcuts one
2401             # could make, but I (Karl Williamson) was afraid that some Perl
2402             # releases would have bugs in some of them. For example [A-Z] works
2403             # even on EBCDIC platforms to match exactly the 26 uppercase English
2404             # letters, but I don't know if it has always worked without bugs. It
2405             # seemed safest just to list the characters.
2406             # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2407 458 0       1058 s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
  0         0  
2408             <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2409 458         1379 #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg;
2410             qq{"$_"};
2411             }
2412             } @stuff;
2413 502         2401 # $out =~ s/\n */ /g if length($out) < 75;
2414             return $out;
2415             }
2416              
2417             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2418              
2419             # A rather unsubtle method of blowing away all the state information
2420             # from a parser object so it can be reused. Provided as a utility for
2421             # backward compatibility in Pod::Man, etc. but not recommended for
2422             # general use.
2423              
2424 5     5 0 10348 sub reinit {
2425 5         20 my $self = shift;
2426             foreach (qw(source_dead source_filename doc_has_started
2427             start_of_pod_block content_seen last_was_blank paras curr_open
2428             line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2429             Title)) {
2430 80         142  
2431             delete $self->{$_};
2432             }
2433             }
2434              
2435             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2436             1;
2437