File Coverage

blib/lib/Pod/Simple/BlackBox.pm
Criterion Covered Total %
statement 854 1006 84.8
branch 368 526 69.9
condition 132 189 69.8
subroutine 59 65 90.7
pod 0 7 0.0
total 1413 1793 78.8


line stmt bran cond sub pod time code
1             package Pod::Simple::BlackBox;
2             #
3             # "What's in the box?" "Pain."
4             #
5             ###########################################################################
6             #
7             # This is where all the scary things happen: parsing lines into
8             # paragraphs; and then into directives, verbatims, and then also
9             # turning formatting sequences into treelets.
10             #
11             # Are you really sure you want to read this code?
12             #
13             #-----------------------------------------------------------------------------
14             #
15             # The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16             # of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17             # to call the proper callbacks on the treelets.
18             #
19             # Every node in a treelet is a ['name', {attrhash}, ...children...]
20              
21 69     69   2327 use integer; # vroom!
  69         142  
  69         511  
22 69     69   2127 use strict;
  69         122  
  69         1260  
23 69     69   291 use Carp ();
  69         313  
  69         1573  
24 69     69   303 use vars qw($VERSION );
  69         105  
  69         14151  
25             $VERSION = '3.43';
26             #use constant DEBUG => 7;
27              
28             sub my_qr ($$) {
29              
30             # $1 is a pattern to compile and return. Older perls compile any
31             # syntactically valid property, even if it isn't legal. To cope with
32             # this, return an empty string unless the compiled pattern also
33             # successfully matches $2, which the caller furnishes.
34              
35 450     450 0 1257 my ($input_re, $should_match) = @_;
36             # XXX could have a third parameter $shouldnt_match for extra safety
37              
38 450 50       3333 my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39              
40 67     67   1188 my $re = eval "no warnings; $use_utf8 qr/$input_re/";
  67     67   178  
  67     67   3455  
  67     67   1059  
  67     67   191  
  67     67   4206  
  67     67   45293  
  67     67   2304  
  67     67   1074  
  67     67   1660  
  67     37   198  
  67     0   2129  
  67     0   407  
  67     0   128  
  67         2201  
  67         424  
  67         118  
  67         2276  
  67         420  
  67         126  
  67         2374  
  67         412  
  67         145  
  67         2196  
  67         405  
  67         143  
  67         2125  
  67         489  
  67         138  
  67         3917  
  450         26395  
  3         18  
  3         8  
  4         37  
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
41             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
42 450 50       1797 return "" if $@;
43              
44 67     67   6344 my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
  67     67   170  
  67     67   2257  
  67     67   646  
  67     67   136  
  67     67   3947  
  67     67   454  
  67     67   138  
  67     67   2371  
  67     747   404  
  67     0   129  
  67     3   2097  
  67     0   452  
  67         168  
  67         2794  
  67         409  
  67         134  
  67         2266  
  67         430  
  67         140  
  67         2493  
  67         432  
  67         128  
  67         2331  
  67         394  
  67         140  
  67         2709  
  450         24894  
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
45             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
46 450 50       1796 return "" if $@;
47              
48             #print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49 450 50       1653 return $re if $matches;
50              
51             #print STDERR __LINE__, ": $re: didn't match\n";
52 1         2 return "";
53             }
54              
55             BEGIN {
56 69     69   1620 require Pod::Simple;
57 69 50       5764 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58             }
59              
60             # Matches a character iff the character will have a different meaning
61             # if we choose CP1252 vs UTF-8 if there is no =encoding line.
62             # This is broken for early Perls on non-ASCII platforms.
63             my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64             $non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65              
66             # Use patterns understandable by Perl 5.6, if possible
67 69     69   535 my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
  69         157  
  69         64154  
68             my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69             # to get assigned
70             my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71             "\x{250}");
72             $rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73              
74 67     67   388 my $script_run_re = eval 'no warnings "experimental::script_run";
  67         125  
  67         15939  
75             qr/(*script_run: ^ .* $ )/x';
76             my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77             unless ($latin_re) {
78             # This was machine generated to be the ranges of the union of the above
79             # three properties, with things that were undefined by Unicode 4.1 filling
80             # gaps. That is the version in use when Perl advanced enough to
81             # successfully compile and execute the above pattern.
82             $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83             }
84              
85             my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86              
87             # Latin script code points not in the first release of Unicode
88             my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89              
90             # If this perl doesn't have the Deprecated property, there's only one code
91             # point in it that we need be concerned with.
92             my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93             $deprecated_re = qr/\x{149}/ unless $deprecated_re;
94              
95             my $utf8_bom;
96             if (($] ge 5.007_003)) {
97             $utf8_bom = "\x{FEFF}";
98             utf8::encode($utf8_bom);
99             } else {
100             $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101             }
102              
103             # This is used so that the 'content_seen' method doesn't return true on a
104             # file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105             # there is a valid =foo line will we return that content was seen.
106             my $seen_legal_directive = 0;
107              
108             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109              
110 1     1 0 41 sub parse_line { shift->parse_lines(@_) } # alias
111              
112             # - - - Turn back now! Run away! - - -
113              
114             sub parse_lines { # Usage: $parser->parse_lines(@lines)
115             # an undef means end-of-stream
116 7636     7636 0 10792 my $self = shift;
117              
118 7636         10587 my $code_handler = $self->{'code_handler'};
119 7636         9695 my $cut_handler = $self->{'cut_handler'};
120 7636         9985 my $wl_handler = $self->{'whiteline_handler'};
121 7636   100     14259 $self->{'line_count'} ||= 0;
122              
123 7636         8664 my $scratch;
124              
125             DEBUG > 4 and
126 7636         8365 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127              
128 7636         8501 DEBUG > 5 and
129             print STDERR "# About to parse lines: ",
130             join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131              
132 7636   100     15162 my $paras = ($self->{'paras'} ||= []);
133             # paragraph buffer. Because we need to defer processing of =over
134             # directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135             # to process this.
136              
137 7636   100     14377 $self->{'pod_para_count'} ||= 0;
138              
139             # An attempt to match the pod portions of a line. This is not fool proof,
140             # but is good enough to serve as part of the heuristic for guessing the pod
141             # encoding if not specified.
142 7636         9213 my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
  73544         135809  
  7636         38303  
143 7636         35498 my $pod_chars_re = qr/ ^ = [A-Za-z]+ | [\Q$codes\E] < /x;
144              
145 7636         11156 my $line;
146 7801         11953 foreach my $source_line (@_) {
147 16149 50       30584 if( $self->{'source_dead'} ) {
148 166         10695 DEBUG > 4 and print STDERR "# Source is dead.\n";
149 166         702 last;
150             }
151              
152 16149 100       39380 unless( defined $source_line ) {
153 833         1510 DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154              
155 833         2644 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156 667         1616 push @$paras, $paras->[-1], $paras->[-1];
157             # So that it definitely fills the buffer.
158 667         1149 $self->{'source_dead'} = 1;
159 2503         4629 $self->_ponder_paragraph_buffer;
160 2503         4018 next;
161             }
162              
163              
164 17152 100       29651 if( $self->{'line_count'}++ ) {
165 16483         32736 ($line = $source_line) =~ tr/\n\r//d;
166             # If we don't have two vars, we'll end up with that there
167             # tr/// modding the (potentially read-only) original source line!
168              
169             } else {
170 2505         4772 DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171              
172 2505 50       7097 if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
    50          
    50          
173 1836         2348 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174 1836         2215 $self->_handle_encoding_line( "=encoding utf8" );
175 1836         4147 delete $self->{'_processed_encoding'};
176 1836         4052 $line =~ tr/\n\r//d;
177              
178             } elsif( $line =~ s/^\xFE\xFF//s ) {
179 1836         2626 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180             $self->scream(
181 17138         36307 $self->{'line_count'},
182             "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183             );
184 1836         9890 splice @_;
185 1836         10462 push @_, undef;
186 1836         3152 next;
187              
188             # TODO: implement somehow?
189              
190             } elsif( $line =~ s/^\xFF\xFE//s ) {
191 1836         3226 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192             $self->scream(
193 3035         6335 $self->{'line_count'},
194             "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195             );
196 0         0 splice @_;
197 0         0 push @_, undef;
198 3035         6397 next;
199              
200             # TODO: implement somehow?
201              
202             } else {
203 890         1259 DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204 890         2451 ($line = $source_line) =~ tr/\n\r//d;
205             }
206             }
207              
208 15537 100 100     109044 if(!$self->{'parse_characters'} && !$self->{'encoding'}
      100        
      100        
      100        
209             && ($self->{'in_pod'} || $line =~ /^=/s)
210             && $line =~ /$non_ascii_re/
211             ) {
212              
213 231         510 my $encoding;
214              
215             # No =encoding line, and we are at the first pod line in the input that
216             # contains a non-ascii byte, that is, one whose meaning varies depending
217             # on whether the file is encoded in UTF-8 or CP1252, which are the two
218             # possibilities permitted by the pod spec. (ASCII is assumed if the
219             # file only contains ASCII bytes.) In order to process this line, we
220             # need to figure out what encoding we will use for the file.
221             #
222             # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223             # 160-255, but it is used here, as it often colloquially is, to refer to
224             # the complete set of code points 0-255, including ASCII (0-127), the C1
225             # controls (128-159), and strict Latin 1 (160-255).
226             #
227             # CP1252 is effectively a superset of Latin 1, because it differs only
228             # from colloquial 8859-1 in the C1 controls, which are very unlikely to
229             # actually be present in 8859-1 files, so can be used for other purposes
230             # without conflict. CP 1252 uses most of them for graphic characters.
231             #
232             # Note that all ASCII-range bytes represent their corresponding code
233             # points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234             # code points require multiple (non-ASCII) bytes to represent. (A
235             # separate paragraph for EBCDIC is below.) The multi-byte
236             # representation is quite structured. If we find an isolated byte that
237             # would require multiple bytes to represent in UTF-8, we know that the
238             # encoding is not UTF-8. If we find a sequence of bytes that violates
239             # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240             # hence must be 1252.
241             #
242             # But there are ambiguous cases where we could guess wrong. If so, the
243             # user will end up having to supply an =encoding line. We use all
244             # readily available information to improve our chances of guessing
245             # right. The odds of something not being UTF-8, but still passing a
246             # UTF-8 validity test go down very rapidly with increasing length of the
247             # sequence. Therefore we look at all non-ascii sequences on the line.
248             # If any of the sequences can't be UTF-8, we quit there and choose
249             # CP1252. If all could be UTF-8, we see if any of the code points
250             # represented are unlikely to be in pod. If so, we guess CP1252. If
251             # not, we check if the line is all in the same script; if not guess
252             # CP1252; otherwise UTF-8. For perls that don't have convenient script
253             # run testing, see if there is both Latin and non-Latin. If so, CP1252,
254             # otherwise UTF-8.
255             #
256             # On EBCDIC platforms, the situation is somewhat different. In
257             # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258             # but so do the bytes that are for the C1 controls. Recall that these
259             # correspond to the unused portion of 8859-1 that 1252 mostly takes
260             # over. That means that there are fewer code points that are
261             # represented by multi-bytes. But, note that the these controls are
262             # very unlikely to be in pod text. So if we encounter one of them, it
263             # means that it is quite likely CP1252 and not UTF-8. The net result is
264             # the same code below is used for both platforms.
265             #
266             # XXX probably if the line has E that evaluates to illegal CP1252,
267             # then it is UTF-8. But we haven't processed E<> yet.
268              
269 231 50       771 goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
270              
271 231         425 my $copy;
272              
273 69     69   606 no warnings 'utf8';
  69         148  
  69         5698  
274              
275 2824 50       5478 if ($] ge 5.007_003) {
276 2609         5631 $copy = $line;
277              
278             # On perls that have this function, we can use it to easily see if the
279             # sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280             # needed below for script run detection
281 225 100       622 goto set_1252 if ! utf8::decode($copy);
282             }
283             elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284             # code page doing here anyway?
285             goto set_utf8;
286             }
287             else { # ASCII, no decode(): do it ourselves using the fundamental
288             # characteristics of UTF-8
289 69     69   45894 use if $] le 5.006002, 'utf8';
  69         905  
  69         1625  
290              
291 215         1343 my $char_ord;
292             my $needed; # How many continuation bytes to gobble up
293              
294             # Initialize the translated line with a dummy character that will be
295             # deleted after everything else is done. This dummy makes sure that
296             # $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297             # with upgrading in the middle
298 0         0 $copy = chr(0x100);
299              
300             # Parse through the line
301 0         0 for (my $i = 0; $i < length $line; $i++) {
302 0         0 my $byte = substr($line, $i, 1);
303              
304             # ASCII bytes are trivially dealt with
305 0 0       0 if ($byte !~ $non_ascii_re) {
306 0         0 $copy .= $byte;
307 0         0 next;
308             }
309              
310 0         0 my $b_ord = ord $byte;
311              
312             # Now figure out what this code point would be if the input is
313             # actually in UTF-8. If, in the process, we discover that it isn't
314             # well-formed UTF-8, we guess CP1252.
315             #
316             # Start the process. If it is UTF-8, we are at the first, start
317             # byte, of a multi-byte sequence. We look at this byte to figure
318             # out how many continuation bytes are needed, and to initialize the
319             # code point accumulator with the data from this byte.
320             #
321             # Normally the minimum continuation byte is 0x80, but in certain
322             # instances the minimum is a higher number. So the code below
323             # overrides this for those instances.
324 0         0 my $min_cont = 0x80;
325              
326 0 0       0 if ($b_ord < 0xC2) { # A start byte < C2 is malformed
    0          
    0          
    0          
327 0         0 goto set_1252;
328             }
329             elsif ($b_ord <= 0xDF) {
330 0         0 $needed = 1;
331 0         0 $char_ord = $b_ord & 0x1F;
332             }
333             elsif ($b_ord <= 0xEF) {
334 0 0       0 $min_cont = 0xA0 if $b_ord == 0xE0;
335 0         0 $needed = 2;
336 215         338 $char_ord = $b_ord & (0x1F >> 1);
337             }
338             elsif ($b_ord <= 0xF4) {
339 215 0       581 $min_cont = 0x90 if $b_ord == 0xF0;
340 2814         17630 $needed = 3;
341 11         20 $char_ord = $b_ord & (0x1F >> 2);
342             }
343             else { # F4 is the highest start byte for legal Unicode; higher is
344             # unlikely to be in pod.
345 11         61 goto set_1252;
346             }
347              
348             # ? not enough continuation bytes available
349 11 0       18 goto set_1252 if $i + $needed >= length $line;
350              
351             # Accumulate the ordinal of the character from the remaining
352             # (continuation) bytes.
353 11         39 while ($needed-- > 0) {
354 11         19 my $cont = substr($line, ++$i, 1);
355 11         142 $b_ord = ord $cont;
356 0 0 0     0 goto set_1252 if $b_ord < $min_cont || $b_ord > 0xBF;
357              
358             # In all cases, any next continuation bytes all have the same
359             # minimum legal value
360 0         0 $min_cont = 0x80;
361              
362             # Accumulate this byte's contribution to the code point
363 0         0 $char_ord <<= 6;
364 0         0 $char_ord |= ($b_ord & 0x3F);
365             }
366              
367             # Here, the sequence that formed this code point was valid UTF-8,
368             # so add the completed character to the output
369 0         0 $copy .= chr $char_ord;
370             } # End of loop through line
371              
372             # Delete the dummy first character
373 0         0 $copy = substr($copy, 1);
374             }
375              
376             # Here, $copy is legal UTF-8.
377              
378             # If it can't be legal CP1252, no need to look further. (These bytes
379             # aren't valid in CP1252.) This test could have been placed higher in
380             # the code, but it seemed wrong to set the encoding to UTF-8 without
381             # making sure that the very first instance is well-formed. But what if
382             # it isn't legal CP1252 either? We have to choose one or the other, and
383             # It seems safer to favor the single-byte encoding over the multi-byte.
384 0 50       0 goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385              
386             # The C1 controls are not likely to appear in pod
387 0 50       0 goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388              
389             # Nor are surrogates nor unassigned, nor deprecated.
390 0 50       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391 0 50 33     0 goto set_1252 if $cs_re && $copy =~ $cs_re;
392 0 50 33     0 DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393 0 50 33     0 goto set_1252 if $cn_re && $copy =~ $cn_re;
394 0 50       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395 0 50       0 goto set_1252 if $copy =~ $deprecated_re;
396              
397             # Nor are rare code points. But this is hard to determine. khw
398             # believes that IPA characters and the modifier letters are unlikely to
399             # be in pod (and certainly very unlikely to be the in the first line in
400             # the pod containing non-ASCII)
401 0 100       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402 0 100 66     0 goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403              
404             # The first Unicode version included essentially every Latin character
405             # in modern usage. So, a Latin character not in the first release will
406             # unlikely be in pod.
407 0 50 33     0 DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408 0 50 33     0 goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409              
410             # On perls that handle script runs, if the UTF-8 interpretation yields
411             # a single script, we guess UTF-8, otherwise just having a mixture of
412             # scripts is suspicious, so guess CP1252. We first strip off, as best
413             # we can, the ASCII characters that look like they are pod directives,
414             # as these would always show as mixed with non-Latin text.
415 0         0 $copy =~ s/$pod_chars_re//g;
416              
417 0 50       0 if ($script_run_re) {
418 0 0       0 goto set_utf8 if $copy =~ $script_run_re;
419 0         0 DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420 0         0 goto set_1252;
421             }
422              
423             # Even without script runs, but on recent enough perls and Unicodes, we
424             # can check if there is a mixture of both Latin and non-Latin. Again,
425             # having a mixture of scripts is suspicious, so assume CP1252
426              
427             # If it's all non-Latin, there is no CP1252, as that is Latin
428             # characters and punct, etc.
429 0 50       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430 0 50       0 goto set_utf8 if $copy !~ $latin_re;
431              
432 0 100       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433 0 100       0 goto set_utf8 if $copy =~ $every_char_is_latin_re;
434              
435 0         0 DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436              
437 10         15 set_1252:
438             DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439 10         22 $encoding = 'CP1252';
440 16         52 goto done_set;
441              
442 6         23 set_utf8:
443             DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444 6         39 $encoding = 'UTF-8';
445              
446 16         128 done_set:
447             $self->_handle_encoding_line( "=encoding $encoding" );
448 16         88 delete $self->{'_processed_encoding'};
449 16 50       89 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450              
451 16         222 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;
452              
453             $self->whine(
454 16         112 $self->{'line_count'},
455             "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456             );
457             }
458              
459 15322         20634 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460              
461 15322 100       25774 if(!$self->{'in_pod'}) {
462 1274 100       3423 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
463 735 100       1979 if($1 eq 'cut') {
464             $self->scream(
465 9         107 $self->{'line_count'},
466             "=cut found outside a pod block. Skipping to next block."
467             );
468              
469             ## Before there were errata sections in the world, it was
470             ## least-pessimal to abort processing the file. But now we can
471             ## just barrel on thru (but still not start a pod block).
472             #splice @_;
473             #push @_, undef;
474              
475 9         31 next;
476             } else {
477             $self->{'in_pod'} = $self->{'start_of_pod_block'}
478 726         2047 = $self->{'last_was_blank'} = 1;
479             # And fall thru to the pod-mode block further down
480             }
481             } else {
482 539         589 DEBUG > 5 and print STDERR "# It's a code-line.\n";
483 539 100       893 $code_handler->(map $_, $line, $self->{'line_count'}, $self)
484             if $code_handler;
485             # Note: this may cause code to be processed out of order relative
486             # to pods, but in order relative to cuts.
487              
488             # Note also that we haven't yet applied the transcoding to $line
489             # by time we call $code_handler!
490              
491 544 50       1097 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
492             # That RE is from perlsyn, section "Plain Old Comments (Not!)",
493             #$fname = $2 if defined $2;
494             #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495 5         33 DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496 5         49 $self->{'line_count'} = $1 - 1;
497             }
498              
499 544         934 next;
500             }
501             }
502              
503             # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504             # Else we're in pod mode:
505              
506             # Apply any necessary transcoding:
507 14774 100       25251 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508              
509             # HERE WE CATCH =encoding EARLY!
510 14780 100       25452 if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511 14 100       76 next if $self->parse_characters; # Ignore this line
512 14         68 $line = $self->_handle_encoding_line( $line );
513             }
514              
515 14777 100       50357 if($line =~ m/^=cut/s) {
    100          
    100          
516             # here ends the pod block, and therefore the previous pod para
517 123         176 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518 130         361 $self->{'in_pod'} = 0;
519             # ++$self->{'pod_para_count'};
520 130         397 $self->_ponder_paragraph_buffer();
521             # by now it's safe to consider the previous paragraph as done.
522 130         213 DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523 130 100       506 $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524             if $cut_handler;
525              
526             # TODO: add to docs: Note: this may cause cuts to be processed out
527             # of order relative to pods, but in order relative to code.
528              
529             } elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530 4670 100 66     17503 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531 2817 100       3982 $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532             if $wl_handler;
533             }
534              
535 7473 100 66     26414 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      100        
536 951         1930 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537 732         1385 push @{$paras->[-1]}, $line;
  523         1192  
538             } # otherwise it's not interesting
539              
540 4659 100 100     14809 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541 4827         6555 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542             }
543              
544 4878         7918 $self->{'last_was_blank'} = 1;
545              
546             } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547              
548 5410 100       16381 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(\s+|$)(.*)/s) {
    100          
549             # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550 2554         10871 my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551 2335 100 100     9282 $new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552             # Note that in "=head1 foo", the WS is lost.
553             # Example: ['=head1', {'start_line' => 123}, ' foo']
554              
555 2335         3711 ++$self->{'pod_para_count'};
556              
557 2554         6876 $self->_ponder_paragraph_buffer();
558             # by now it's safe to consider the previous paragraph as done.
559              
560 4930         10646 push @$paras, $new; # the new incipient paragraph
561 4930         8704 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562              
563             } elsif($line =~ m/^\s/s) {
564              
565 653 100 33     3408 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      66        
566 169         298 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567 2732         10117 push @{$paras->[-1]}, $line;
  184         413  
568             } else {
569 529         839 ++$self->{'pod_para_count'};
570 529         1457 $self->_ponder_paragraph_buffer();
571             # by now it's safe to consider the previous paragraph as done.
572 529         782 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573 529         2087 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574             }
575             } else {
576 3304         8056 ++$self->{'pod_para_count'};
577 2256         5623 $self->_ponder_paragraph_buffer();
578             # by now it's safe to consider the previous paragraph as done.
579 3304         13528 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580 2343         3712 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581             }
582 5299         11171 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583              
584             } else {
585             # It's a non-blank line /continuing/ the current para
586 4912 50       8756 if(@$paras) {
587 5873         9326 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588 5819         7419 push @{$paras->[-1]}, $line;
  5873         11880  
589             } else {
590             # Unexpected case!
591 1146         4056 die "Continuing a paragraph but \@\$paras is empty?";
592             }
593 5322         11566 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594             }
595              
596             } # ends the big while loop
597              
598 8153         10861 DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599 8153         37576 return $self;
600             }
601              
602             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603              
604             sub _handle_encoding_line {
605 535     18   1599 my($self, $line) = @_;
606              
607 535 50       1186 return if $self->parse_characters;
608              
609             # The point of this routine is to set $self->{'_transcoder'} as indicated.
610              
611 535 50       878 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
612 111         532 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
613              
614 52         98 my $e = $1;
615 52         92 my $orig = $e;
616 52         114 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
  76         172  
617              
618 76         222 my $enc_error;
619              
620             # Cf. perldoc Encode and perldoc Encode::Supported
621              
622 76         4715 require Pod::Simple::Transcode;
623              
624 76 100 33     355 if( $self->{'encoding'} ) {
    50          
    100          
625 534         838 my $norm_current = $self->{'encoding'};
626 534         1431 my $norm_e = $e;
627 534         2054 foreach my $that ($norm_current, $norm_e) {
628 534         855 $that = lc($that);
629 1146         2504 $that =~ s/[-_]//g;
630             }
631 333 100       672 if($norm_current eq $norm_e) {
632 333         414 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
633             "redundant. ($norm_current eq $norm_e). Ignoring.\n";
634 333         415 $enc_error = '';
635             # But that doesn't necessarily mean that the earlier one went okay
636             } else {
637 333         773 $enc_error = "Encoding is already set to " . $self->{'encoding'};
638 0         0 DEBUG > 1 and print STDERR $enc_error;
639             }
640             } elsif (
641             # OK, let's turn on the encoding
642             do {
643 350         816 DEBUG > 1 and print STDERR " Setting encoding to $e\n";
644 1853         2401 $self->{'encoding'} = $e;
645 1853         9708 1;
646             }
647             and $e eq 'HACKRAW'
648             ) {
649 42         119 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
650              
651             } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
652              
653             die($enc_error = "WHAT? _transcoder is already set?!")
654 59 50       6232 if $self->{'_transcoder'}; # should never happen
655 59         381 require Pod::Simple::Transcode;
656 59         168 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
657 59         150 eval {
658 59         124 my @x = ('', "abc", "123");
659 59         127 $self->{'_transcoder'}->(@x);
660             };
661 59 50       252 $@ && die( $enc_error =
662             "Really unexpected error setting up encoding $e: $@\nAborting"
663             );
664 59         144 $self->{'detected_encoding'} = $e;
665              
666             } else {
667 42         3411 my @supported = Pod::Simple::Transcode::->all_encodings;
668              
669             # Note unsupported, and complain
670 42         172 DEBUG and print STDERR " Encoding [$e] is unsupported.",
671             "\nSupporteds: @supported\n";
672 5         11 my $suggestion = '';
673              
674             # Look for a near match:
675 5         10 my $norm = lc($e);
676 5         11 $norm =~ tr[-_][]d;
677 10         25 my $n;
678 10         45 foreach my $enc (@supported) {
679 5         17 $n = lc($enc);
680 3         5 $n =~ tr[-_][]d;
681 3 50       7 next unless $n eq $norm;
682 2         6 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
683 2         4 last;
684             }
685 37         54 my $encmodver = Pod::Simple::Transcode::->encmodver;
686 37         225 $enc_error = join '' =>
687             "This document probably does not appear as it should, because its ",
688             "\"=encoding $e\" line calls for an unsupported encoding.",
689             $suggestion, " [$encmodver\'s supported encodings are: @supported]"
690             ;
691              
692 37         417 $self->scream( $self->{'line_count'}, $enc_error );
693             }
694 17         36 push @{ $self->{'encoding_command_statuses'} }, $enc_error;
  52         34649  
695 52 100       225 if (defined($self->{'_processed_encoding'})) {
696             # Double declaration.
697 35         167 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
698             }
699 52         126 $self->{'_processed_encoding'} = $orig;
700              
701 52         153 return $line;
702             }
703              
704             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
705              
706             sub _handle_encoding_second_level {
707             # By time this is called, the encoding (if well formed) will already
708             # have been acted on.
709 42     8   115 my($self, $para) = @_;
710 42         127 my @x = @$para;
711 42         110 my $content = join ' ', splice @x, 2;
712 9         1979 $content =~ s/^\s+//s;
713 9         2040 $content =~ s/\s+$//s;
714              
715 9         27 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
716              
717 9 100       31 if (defined($self->{'_processed_encoding'})) {
718             #if($content ne $self->{'_processed_encoding'}) {
719             # Could it happen?
720             #}
721 9         51 delete $self->{'_processed_encoding'};
722             # It's already been handled. Check for errors.
723 9 50       50 if(! $self->{'encoding_command_statuses'} ) {
    100          
724 2         7 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
725             } elsif( $self->{'encoding_command_statuses'}[-1] ) {
726             $self->whine( $para->[1]{'start_line'},
727             sprintf "Couldn't do %s: %s",
728             $self->{'encoding_command_reqs' }[-1],
729 248         326 $self->{'encoding_command_statuses'}[-1],
730             );
731             } else {
732 255         326 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
733             }
734              
735             } else {
736             # Otherwise it's a syntax error
737 248         421 $self->whine( $para->[1]{'start_line'},
738             "Invalid =encoding syntax: $content"
739             );
740             }
741              
742 7         17 return;
743             }
744              
745             #~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
746              
747             {
748             my $m = -321; # magic line number
749              
750             sub _gen_errata {
751 667     668   1037 my $self = $_[0];
752             # Return 0 or more fake-o paragraphs explaining the accumulated
753             # errors on this document.
754              
755 669 100 66     2335 return() unless $self->{'errata'} and keys %{$self->{'errata'}};
  16         105  
756              
757 16         51 my @out;
758              
759 56         99 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
  67         195  
  56         194  
760             push @out,
761             ['=item', {'start_line' => $m}, "Around line $line:"],
762             map( ['~Para', {'start_line' => $m, '~cooked' => 1},
763             #['~Top', {'start_line' => $m},
764             $_
765             #]
766             ],
767 33         140 @{$self->{'errata'}{$line}}
  74         293  
768             )
769             ;
770             }
771              
772             # TODO: report of unknown entities? unrenderable characters?
773              
774 56         257 unshift @out,
775             ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
776             ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
777             "Hey! ",
778             ['B', {},
779             'The above document had some coding errors, which are explained below:'
780             ]
781             ],
782             ['=over', {'start_line' => $m, 'errata' => 1}, ''],
783             ;
784              
785 45         142 push @out,
786             ['=back', {'start_line' => $m, 'errata' => 1}, ''],
787             ;
788              
789 45         131 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
790              
791 45         159 return @out;
792             }
793              
794             }
795              
796             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797              
798             ##############################################################################
799             ##
800             ## stop reading now stop reading now stop reading now stop reading now stop
801             ##
802             ## HERE IT BECOMES REALLY SCARY
803             ##
804             ## stop reading now stop reading now stop reading now stop reading now stop
805             ##
806             ##############################################################################
807              
808             sub _ponder_paragraph_buffer {
809              
810             # Para-token types as found in the buffer.
811             # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
812             # =over, =back, =item
813             # and the null =pod (to be complained about if over one line)
814             #
815             # "~data" paragraphs are something we generate at this level, depending on
816             # a currently open =over region
817              
818             # Events fired: Begin and end for:
819             # directivename (like head1 .. head4), item, extend,
820             # for (from =begin...=end, =for),
821             # over-bullet, over-number, over-text, over-block,
822             # item-bullet, item-number, item-text,
823             # Document,
824             # Data, Para, Verbatim
825             # B, C, longdirname (TODO -- wha?), etc. for all directives
826             #
827              
828 5870     5840   8057 my $self = $_[0];
829 5870         7314 my $paras;
830 5870 100       7021 return unless @{$paras = $self->{'paras'}};
  5870         13115  
831 5146   100     12719 my $curr_open = ($self->{'curr_open'} ||= []);
832              
833 5146         6400 my $scratch;
834              
835 5116         6366 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
836              
837             # We have something in our buffer. So apparently the document has started.
838 5120 100       9297 unless($self->{'doc_has_started'}) {
839 707         1738 $self->{'doc_has_started'} = 1;
840              
841 682         1010 my $starting_contentless;
842 712   66     4035 $starting_contentless =
843             (
844             !@$curr_open
845             and @$paras and ! grep $_->[0] ne '~end', @$paras
846             # i.e., if the paras is all ~ends
847             )
848             ;
849 902         1296 DEBUG and print STDERR "# Starting ",
850             $starting_contentless ? 'contentless' : 'contentful',
851             " document\n"
852             ;
853              
854             $self->_handle_element_start(
855             ($scratch = 'Document'),
856             {
857 902 100       4309 'start_line' => $paras->[0][1]{'start_line'},
858             $starting_contentless ? ( 'contentless' => 1 ) : (),
859             },
860             );
861             }
862              
863 5142         8474 my($para, $para_type);
864 5142         9616 while(@$paras) {
865              
866             # If a directive, assume it's legal; subtract below if found not to be
867 7341 100       20226 $seen_legal_directive++ if $paras->[0][0] =~ /^=/;
868              
869             last if @$paras == 1
870             and ( $paras->[0][0] eq '=over'
871             or $paras->[0][0] eq '=item'
872 7316 100 100     31558 or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
      100        
873             # Those're the three kinds of paragraphs that require lookahead.
874             # Actually, an "=item Foo" inside an region
875             # and any =item inside an region (rare)
876             # don't require any lookahead, but all others (bullets
877             # and numbers) do.
878             # The verbatim is different from the other two, because those might be
879             # like:
880             #
881             # =item
882             # ...
883             # =cut
884             # ...
885             # =item
886             #
887             # The =cut here finishes the paragraph but doesn't terminate the =over
888             # they should be in. (khw apologizes that he didn't comment at the time
889             # why the 'in_pod' works, and no longer remembers why, and doesn't think
890             # it is currently worth the effort to re-figure it out.)
891              
892             # TODO: whinge about many kinds of directives in non-resolving =for regions?
893             # TODO: many? like what? =head1 etc?
894              
895 5943         11063 $para = shift @$paras;
896 5944         9960 $para_type = $para->[0];
897              
898 5944         7389 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
899             $self->_dump_curr_open(), ")\n";
900              
901 5943 100       18217 if($para_type eq '=for') {
    100          
    100          
    100          
902 74 50       235 next if $self->_ponder_for($para,$curr_open,$paras);
903              
904             } elsif($para_type eq '=begin') {
905 83 50       209 next if $self->_ponder_begin($para,$curr_open,$paras);
906              
907             } elsif($para_type eq '=end') {
908 83 50       280 next if $self->_ponder_end($para,$curr_open,$paras);
909              
910             } elsif($para_type eq '~end') { # The virtual end-document signal
911 2065 50       4114 next if $self->_ponder_doc_end($para,$curr_open,$paras);
912             }
913              
914              
915             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
916             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
917 6446 100       14596 if(grep $_->[1]{'~ignore'}, @$curr_open) {
918 1398         1897 DEBUG > 1 and
919             print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
920 1398         3514 next;
921             }
922             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
923             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
924              
925 6217 100       16735 if($para_type eq '=pod') {
    100          
    100          
926 1648         2930 $self->_ponder_pod($para,$curr_open,$paras);
927              
928             } elsif($para_type eq '=over') {
929 1325 100       2179 next if $self->_ponder_over($para,$curr_open,$paras);
930              
931             } elsif($para_type eq '=back') {
932 1325 100       2853 next if $self->_ponder_back($para,$curr_open,$paras);
933              
934             } else {
935              
936             # All non-magical codes!!!
937              
938             # Here we start using $para_type for our own twisted purposes, to
939             # mean how it should get treated, not as what the element name
940             # should be.
941              
942 4478         5888 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
943              
944 4478         5562 my $i;
945              
946             # Enforce some =headN discipline
947 4478 100 66     12883 if($para_type =~ m/^=head\d$/s
      100        
      66        
948             and ! $self->{'accept_heads_anywhere'}
949             and @$curr_open
950             and $curr_open->[-1][0] eq '=over'
951             ) {
952 221         359 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
953             $self->whine(
954 221         1207 $para->[1]{'start_line'},
955             "You forgot a '=back' before '$para_type'"
956             );
957 1169         2094 unshift @$paras, ['=back', {}, ''], $para; # close the =over
958 1169         2322 next;
959             }
960              
961              
962 5917 100 66     21284 if($para_type eq '=item') {
    100          
    100          
    100          
    100          
    100          
    100          
963              
964 2628         8811 my $over;
965 2511 50 33     5662 unless(@$curr_open and
966 2619         7007 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
967             $self->whine(
968 1543         2084 $para->[1]{'start_line'},
969             "'=item' outside of any '=over'"
970             );
971             unshift @$paras,
972 1543         4997 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
973             $para
974             ;
975 2         13 next;
976             }
977              
978              
979 1038         2045 my $over_type = $over->[1]{'~type'};
980              
981 1041 50       3558 if(!$over_type) {
    50          
    100          
    100          
    50          
982             # Shouldn't happen1
983             die "Typeless over in stack, starting at line "
984 251         752 . $over->[1]{'start_line'};
985              
986             } elsif($over_type eq 'block') {
987 1147 0       2837 unless($curr_open->[-1][1]{'~bitched_about'}) {
988 22         31 $curr_open->[-1][1]{'~bitched_about'} = 1;
989             $self->whine(
990             $curr_open->[-1][1]{'start_line'},
991             "You can't have =items (as at line "
992 22         47 . $para->[1]{'start_line'}
993             . ") unless the first thing after the =over is an =item"
994             );
995             }
996             # Just turn it into a paragraph and reconsider it
997 1125         2783 $para->[0] = '~Para';
998 98         298 unshift @$paras, $para;
999 57         250 next;
1000              
1001             } elsif($over_type eq 'text') {
1002 834         2732 my $item_type = $self->_get_item_type($para);
1003             # That kills the content of the item if it's a number or bullet.
1004 1698         2354 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1005              
1006 1698 100 66     3034 if($item_type eq 'text') {
    50          
1007             # Nothing special needs doing for 'text'
1008             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1009             $self->whine(
1010 921         2985 $para->[1]{'start_line'},
1011             "Expected text after =item, not a $item_type"
1012             );
1013             # Undo our clobbering:
1014 10         17 push @$para, $para->[1]{'~orig_content'};
1015 10         56 delete $para->[1]{'number'};
1016             # Only a PROPER item-number element is allowed
1017             # to have a number attribute.
1018             } else {
1019 6         18 die "Unhandled item type $item_type"; # should never happen
1020             }
1021              
1022             # =item-text thingies don't need any assimilation, it seems.
1023              
1024             } elsif($over_type eq 'number') {
1025 34         94 my $item_type = $self->_get_item_type($para);
1026             # That kills the content of the item if it's a number or bullet.
1027 939         4137 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1028              
1029 82         147 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1030              
1031 82 50       330 if($item_type eq 'bullet') {
    50          
    50          
    50          
1032             # Hm, it's not numeric. Correct for this.
1033 54         283 $para->[1]{'number'} = $expected_value;
1034             $self->whine(
1035 0         0 $para->[1]{'start_line'},
1036             "Expected '=item $expected_value'"
1037             );
1038 0         0 push @$para, $para->[1]{'~orig_content'};
1039             # restore the bullet, blocking the assimilation of next para
1040              
1041             } elsif($item_type eq 'text') {
1042             # Hm, it's not numeric. Correct for this.
1043 0         0 $para->[1]{'number'} = $expected_value;
1044             $self->whine(
1045 54         131 $para->[1]{'start_line'},
1046             "Expected '=item $expected_value'"
1047             );
1048             # Text content will still be there and will block next ~Para
1049              
1050             } elsif($item_type ne 'number') {
1051 54         227 die "Unknown item type $item_type"; # should never happen
1052              
1053             } elsif($expected_value == $para->[1]{'number'}) {
1054 28         36 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1055              
1056             } else {
1057 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1058             " instead of the expected value of $expected_value\n";
1059             $self->whine(
1060             $para->[1]{'start_line'},
1061 0         0 "You have '=item " . $para->[1]{'number'} .
1062             "' instead of the expected '=item $expected_value'"
1063             );
1064 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1065             }
1066              
1067 28 50       71 if(@$para == 2) {
1068             # For the cases where we /didn't/ push to @$para
1069 28 100       68 if($paras->[0][0] eq '~Para') {
1070 25         31 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1071 79         212 push @$para, splice @{shift @$paras},2;
  79         173  
1072             } else {
1073 57         163 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1074 3         14 push @$para, ''; # Just so it's not contentless
1075             }
1076             }
1077              
1078              
1079             } elsif($over_type eq 'bullet') {
1080 159         522 my $item_type = $self->_get_item_type($para);
1081             # That kills the content of the item if it's a number or bullet.
1082 159         281 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1083              
1084 159 50       371 if($item_type eq 'bullet') {
    0          
    0          
1085             # as expected!
1086              
1087 159 100       384 if( $para->[1]{'~_freaky_para_hack'} ) {
1088 101         127 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1089 101         214 push @$para, $para->[1]{'~_freaky_para_hack'};
1090             }
1091              
1092             } elsif($item_type eq 'number') {
1093             $self->whine(
1094 0         0 $para->[1]{'start_line'},
1095             "Expected '=item *'"
1096             );
1097 0         0 push @$para, $para->[1]{'~orig_content'};
1098             # and block assimilation of the next paragraph
1099 0         0 delete $para->[1]{'number'};
1100             # Only a PROPER item-number element is allowed
1101             # to have a number attribute.
1102             } elsif($item_type eq 'text') {
1103             $self->whine(
1104 0         0 $para->[1]{'start_line'},
1105             "Expected '=item *'"
1106             );
1107             # But doesn't need processing. But it'll block assimilation
1108             # of the next para.
1109             } else {
1110 0         0 die "Unhandled item type $item_type"; # should never happen
1111             }
1112              
1113 159 100       398 if(@$para == 2) {
1114             # For the cases where we /didn't/ push to @$para
1115 58 50       138 if($paras->[0][0] eq '~Para') {
1116 58         72 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1117 58         87 push @$para, splice @{shift @$paras},2;
  58         142  
1118             } else {
1119 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1120 0         0 push @$para, ''; # Just so it's not contentless
1121             }
1122             }
1123              
1124             } else {
1125 0         0 die "Unhandled =over type \"$over_type\"?";
1126             # Shouldn't happen!
1127             }
1128              
1129 968         1688 $para_type = 'Plain';
1130 968         2560 $para->[0] .= '-' . $over_type;
1131             # Whew. Now fall thru and process it.
1132              
1133              
1134             } elsif($para_type eq '=extend') {
1135             # Well, might as well implement it here.
1136 0         0 $self->_ponder_extend($para);
1137 0         0 next; # and skip
1138             } elsif($para_type eq '=encoding') {
1139             # Not actually acted on here, but we catch errors here.
1140 7         68 $self->_handle_encoding_second_level($para);
1141 7 100       39 next unless $self->keep_encoding_directive;
1142 4         9 $para_type = 'Plain';
1143             } elsif($para_type eq '~Verbatim') {
1144 483         965 $para->[0] = 'Verbatim';
1145 483         826 $para_type = '?Verbatim';
1146             } elsif($para_type eq '~Para') {
1147 2198         3698 $para->[0] = 'Para';
1148 2198         3269 $para_type = '?Plain';
1149             } elsif($para_type eq 'Data') {
1150 28         49 $para->[0] = 'Data';
1151 28         44 $para_type = '?Data';
1152             } elsif( $para_type =~ s/^=//s
1153             and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1154             ) {
1155 565         856 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1156             } else {
1157             # An unknown directive!
1158 8         19 $seen_legal_directive--;
1159             DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1160 8         10 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1161             ;
1162             $self->whine(
1163 8         57 $para->[1]{'start_line'},
1164             "Unknown directive: $para->[0]"
1165             );
1166              
1167             # And maybe treat it as text instead of just letting it go?
1168 8         24 next;
1169             }
1170              
1171 4246 100       14950 if($para_type =~ s/^\?//s) {
1172 2709 100       5777 if(! @$curr_open) { # usual case
1173 1610         2154 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1174             } else {
1175 1099         2931 my @fors = grep $_->[0] eq '=for', @$curr_open;
1176             DEBUG > 1 and print STDERR "Containing fors: ",
1177 1099         1418 join(',', map $_->[1]{'target'}, @fors), "\n";
1178              
1179 1099 100       2265 if(! @fors) {
    100          
1180 1111         1831 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1181              
1182             #} elsif(grep $_->[1]{'~resolve'}, @fors) {
1183             #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1184             } elsif( $fors[-1][1]{'~resolve'} ) {
1185             # Look to the immediately containing for
1186              
1187 74 100       213 if($para_type eq 'Data') {
1188 38         97 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1189 38         92 $para->[0] = 'Para';
1190 48         185 $para_type = 'Plain';
1191             } else {
1192 34         128 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1193             }
1194             } else {
1195 53         89 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1196 75         201 $para->[0] = $para_type = 'Data';
1197             }
1198             }
1199             }
1200              
1201             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1202 4299 100       8353 if($para_type eq 'Plain') {
    100          
    50          
1203 4317         9943 $self->_ponder_Plain($para);
1204             } elsif($para_type eq 'Verbatim') {
1205 1055         2552 $self->_ponder_Verbatim($para);
1206             } elsif($para_type eq 'Data') {
1207 24         99 $self->_ponder_Data($para);
1208             } else {
1209 2         4 die "\$para type is $para_type -- how did that happen?";
1210             # Shouldn't happen.
1211             }
1212              
1213             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1214 4418         11852 $para->[0] =~ s/^[~=]//s;
1215              
1216 4250         5632 DEBUG and print STDERR "\n", pretty($para), "\n";
1217              
1218             # traverse the treelet (which might well be just one string scalar)
1219             $self->{'content_seen'} ||= 1 if $seen_legal_directive
1220 4250 100 100     18344 && ! $self->{'~tried_gen_errata'};
      100        
1221 4250         9880 $self->_traverse_treelet_bit(@$para);
1222             }
1223             }
1224              
1225 5120         13873 return;
1226             }
1227              
1228             ###########################################################################
1229             # The sub-ponderers...
1230              
1231              
1232              
1233             sub _ponder_for {
1234 934     49   3502 my ($self,$para,$curr_open,$paras) = @_;
1235              
1236             # Fake it out as a begin/end
1237 677         1495 my $target;
1238              
1239 502 50       773 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1240 175         548 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1241 175         251 return 1;
1242             }
1243              
1244 223         564 for(my $i = 2; $i < @$para; ++$i) {
1245 141 50       388 if($para->[$i] =~ s/^\s*(\S+)\s*//s) {
1246 101         217 $target = $1;
1247 49         81 last;
1248             }
1249             }
1250 49 50       109 unless(defined $target) {
1251             $self->whine(
1252 1         2 $para->[1]{'start_line'},
1253             "=for without a target?"
1254             );
1255 52         82 return 1;
1256             }
1257 77         97 DEBUG > 1 and
1258             print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1259              
1260 77         154 $para->[0] = 'Data';
1261              
1262             unshift @$paras,
1263             ['=begin',
1264             {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1265             $target,
1266             ],
1267             $para,
1268             ['=end',
1269 934         2084 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1270             $target,
1271             ],
1272             ;
1273              
1274 851         2399 return 1;
1275             }
1276              
1277             sub _ponder_begin {
1278 108     223   278 my ($self,$para,$curr_open,$paras) = @_;
1279 89         264 my $content = join ' ', splice @$para, 2;
1280 57         133 $content =~ s/^\s+//s;
1281 943         2641 $content =~ s/\s+$//s;
1282 943 50       1479 unless(length($content)) {
1283             $self->whine(
1284 886         4299 $para->[1]{'start_line'},
1285             "=begin without a target?"
1286             );
1287 886         2151 DEBUG and print STDERR "Ignoring targetless =begin\n";
1288 1169         3261 return 1;
1289             }
1290              
1291 59         259 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/;
1292 59 100       151 $para->[1]{'title'} = $title if ($title);
1293 59         119 $para->[1]{'target'} = $target; # without any ':'
1294 57         90 $content = $target; # strip off the title
1295              
1296 57         105 $content =~ s/^:!/!:/s;
1297 59         90 my $neg; # whether this is a negation-match
1298 59 100       147 $neg = 1 if $content =~ s/^!//s;
1299 59         90 my $to_resolve; # whether to process formatting codes
1300 59 100       157 $to_resolve = 1 if $content =~ s/^://s;
1301              
1302 59         86 my $dont_ignore; # whether this target matches us
1303              
1304 57 100       206 foreach my $target_name (
1305             split(',', $content, -1),
1306             $neg ? () : '*'
1307             ) {
1308 116         151 DEBUG > 2 and
1309             print STDERR " Considering whether =begin $content matches $target_name\n";
1310 118 100       266 next unless $self->{'accept_targets'}{$target_name};
1311              
1312 32         43 DEBUG > 2 and
1313             print STDERR " It DOES match the acceptable target $target_name!\n";
1314             $to_resolve = 1
1315 32 100       111 if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1316 32         51 $dont_ignore = 1;
1317 100         199 $para->[1]{'target_matching'} = $target_name;
1318 100         237 last; # stop looking at other target names
1319             }
1320              
1321 127 100       293 if($neg) {
1322 89 100       195 if( $dont_ignore ) {
1323 76         162 $dont_ignore = '';
1324 6         12 delete $para->[1]{'target_matching'};
1325 6         19 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1326             } else {
1327 13         24 $dont_ignore = 1;
1328 83         312 $para->[1]{'target_matching'} = '!';
1329 83         185 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1330             }
1331             }
1332              
1333 127         240 $para->[0] = '=for'; # Just what we happen to call these, internally
1334 127   100     237 $para->[1]{'~really'} ||= '=begin';
1335 127   100     382 $para->[1]{'~ignore'} = (! $dont_ignore) || 0;
1336 127   100     262 $para->[1]{'~resolve'} = $to_resolve || 0;
1337              
1338 127         220 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1339             "ignore contents of this region\n";
1340 127         177 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1341             ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1342 127         276 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1343              
1344 127         209 push @$curr_open, $para;
1345 127 100 100     523 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1346 153         212 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1347             } else {
1348 170 50 100     451 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1349 73         198 $self->_handle_element_start((my $scratch='for'), $para->[1]);
1350             }
1351              
1352 93         274 return 1;
1353             }
1354              
1355             sub _ponder_end {
1356 93     57   169 my ($self,$para,$curr_open,$paras) = @_;
1357 93         218 my $content = join ' ', splice @$para, 2;
1358 93         197 $content =~ s/^\s+//s;
1359 127         269 $content =~ s/\s+$//s;
1360 71         97 DEBUG and print STDERR "Ogling '=end $content' directive\n";
1361              
1362 57 50       155 unless(length($content)) {
1363             $self->whine(
1364             $para->[1]{'start_line'},
1365             "'=end' without a target?" . (
1366             ( @$curr_open and $curr_open->[-1][0] eq '=for' )
1367 0 0 0     0 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1368             : ''
1369             )
1370             );
1371 0         0 DEBUG and print STDERR "Ignoring targetless =end\n";
1372 14         20 return 1;
1373             }
1374              
1375 71 50       230 unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1376             $self->whine(
1377 14         17 $para->[1]{'start_line'},
1378             "'=end $content' is invalid. (Stack: "
1379             . $self->_dump_curr_open() . ')'
1380             );
1381 70         107 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1382 70         283 return 1;
1383             }
1384              
1385 127 50 33     448 unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1386             $self->whine(
1387 70         179 $para->[1]{'start_line'},
1388             "=end $content without matching =begin. (Stack: "
1389             . $self->_dump_curr_open() . ')'
1390             );
1391 70         83 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1392 70         86 return 1;
1393             }
1394              
1395 127 100       246 unless($content eq $curr_open->[-1][1]{'target'}) {
1396             $self->whine(
1397             $para->[1]{'start_line'},
1398             "=end $content doesn't match =begin "
1399 70         140 . $curr_open->[-1][1]{'target'}
1400             . ". (Stack: "
1401             . $self->_dump_curr_open() . ')'
1402             );
1403 70         270 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1404 20         29 return 1;
1405             }
1406              
1407             # Else it's okay to close...
1408 107 100       328 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1409 70         196 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1410             # And that may be because of this to-be-closed =for region, or some
1411             # other one, but it doesn't matter.
1412             } else {
1413 107         338 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1414             # what's that for?
1415              
1416 110 50 50     284 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1417 110         312 $self->_handle_element_end( my $scratch = 'for', $para->[1]);
1418             }
1419 130         299 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1420 130         245 pop @$curr_open;
1421              
1422 130         365 return 1;
1423             }
1424              
1425             sub _ponder_doc_end {
1426 760     2523   1625 my ($self,$para,$curr_open,$paras) = @_;
1427 687 100       1381 if(@$curr_open) { # Deal with things left open
1428 6         11 DEBUG and print STDERR "Stack is nonempty at end-document: (",
1429             $self->_dump_curr_open(), ")\n";
1430              
1431 6         8 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1432 79         267 unshift @$paras, $self->_closers_for_all_curr_open;
1433             # Make sure there is exactly one ~end in the parastack, at the end:
1434 6         26 @$paras = grep $_->[0] ne '~end', @$paras;
1435 6         11 push @$paras, $para, $para;
1436             # We need two -- once for the next cycle where we
1437             # generate errata, and then another to be at the end
1438             # when that loop back around to process the errata.
1439 6         24 return 1;
1440              
1441             } else {
1442 754         1200 DEBUG and print STDERR "Okay, stack is empty now.\n";
1443             }
1444              
1445             # Try generating errata section, if applicable
1446 681 100       1602 unless($self->{'~tried_gen_errata'}) {
1447 667         1312 $self->{'~tried_gen_errata'} = 1;
1448 667         1824 my @extras = $self->_gen_errata();
1449 740 100       1707 if(@extras) {
1450 17         57 unshift @$paras, @extras;
1451 17         32 DEBUG and print STDERR "Generated errata... relooping...\n";
1452 17         79 return 1; # I.e., loop around again to process these fake-o paragraphs
1453             }
1454             }
1455              
1456 737         1579 splice @$paras; # Well, that's that for this paragraph buffer.
1457 687         948 DEBUG and print STDERR "Throwing end-document event.\n";
1458              
1459 717         2010 $self->_handle_element_end( my $scratch = 'Document' );
1460 717         2345 return 1; # Hasta la byebye
1461             }
1462              
1463             sub _ponder_pod {
1464 529     521   1255 my ($self,$para,$curr_open,$paras) = @_;
1465             $self->whine(
1466 549 50       1182 $para->[1]{'start_line'},
1467             "=pod directives shouldn't be over one line long! Ignoring all "
1468             . (@$para - 2) . " lines of content"
1469             ) if @$para > 3;
1470              
1471             # Content ignored unless 'pod_handler' is set
1472 549 100       1220 if (my $pod_handler = $self->{'pod_handler'}) {
1473 70         328 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1474 251 100       568 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1475 251         496 $pod_handler->($line, $line_num, $self);
1476             }
1477              
1478             # The surrounding methods set content_seen, so let us remain consistent.
1479             # I do not know why it was not here before -- should it not be here?
1480             # $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1481              
1482 483         1144 return;
1483             }
1484              
1485             sub _ponder_over {
1486 160     187   530 my ($self,$para,$curr_open,$paras) = @_;
1487 160 50       451 return 1 unless @$paras;
1488 160         302 my $list_type;
1489              
1490 160 100       479 if($paras->[0][0] eq '=item') { # most common case
    100          
    50          
1491 147         692 $list_type = $self->_get_initial_item_type($paras->[0]);
1492              
1493             } elsif($paras->[0][0] eq '=back') {
1494             # Ignore empty lists by default
1495 247 100       345 if ($self->{'parse_empty_lists'}) {
1496 247         595 $list_type = 'empty';
1497             } else {
1498 221         444 shift @$paras;
1499 221         584 return 1;
1500             }
1501             } elsif($paras->[0][0] eq '~end') {
1502             $self->whine(
1503 221         573 $para->[1]{'start_line'},
1504             "=over is the last thing in the document?!"
1505             );
1506 26         76 return 1; # But feh, ignore it.
1507             } else {
1508 39         64 $list_type = 'block';
1509             }
1510 182         649 $para->[1]{'~type'} = $list_type;
1511 377         788 push @$curr_open, $para;
1512             # yes, we reuse the paragraph as a stack item
1513              
1514 377         769 my $content = join ' ', splice @$para, 2;
1515 377         1137 $para->[1]{'~orig_content'} = $content;
1516 377         1063 my $overness;
1517 254 100       1322 if($content =~ m/^\s*$/s) {
    50          
1518 164         371 $para->[1]{'indent'} = 4;
1519             } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
1520 69     67   459645 no integer;
  67         183  
  67         600  
1521 188         535 $para->[1]{'indent'} = $1;
1522 96 50       462 if($1 == 0) {
1523             $self->whine(
1524 6         23 $para->[1]{'start_line'},
1525             "Can't have a 0 in =over $content"
1526             );
1527 6         16 $para->[1]{'indent'} = 4;
1528             }
1529             } else {
1530             $self->whine(
1531 98         261 $para->[1]{'start_line'},
1532             "=over should be: '=over' or '=over positive_number'"
1533             );
1534 57         147 $para->[1]{'indent'} = 4;
1535             }
1536 213         383 DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1537              
1538 213 100 100     780 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1539 213         983 $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1540              
1541 198         811 return;
1542             }
1543              
1544             sub _ponder_back {
1545 162     377   435 my ($self,$para,$curr_open,$paras) = @_;
1546             # TODO: fire off or or ??
1547              
1548 158         455 my $content = join ' ', splice @$para, 2;
1549 160 50       473 if($content =~ m/\S/) {
1550             $self->whine(
1551 4         20 $para->[1]{'start_line'},
1552             "=back doesn't take any parameters, but you said =back $content"
1553             );
1554             }
1555              
1556 156 50 33     786 if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1557 156         230 DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1558             # Expected case: we're closing the most recently opened thing
1559             #my $over = pop @$curr_open;
1560 165 100 50     644 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1561             $self->_handle_element_end( my $scratch =
1562 209         943 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1563             );
1564             } else {
1565 53         122 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1566             join(', ', map $_->[0], @$curr_open), ").\n";
1567             $self->whine(
1568 53         149 $para->[1]{'start_line'},
1569             '=back without =over'
1570             );
1571 53         120 return 1; # and ignore it
1572             }
1573             }
1574              
1575             sub _ponder_item {
1576 53     1378   129 my ($self,$para,$curr_open,$paras) = @_;
1577 53         265 my $over;
1578 53 0 0     138 unless(@$curr_open and
1579 0         0 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1580             $self->whine(
1581 0         0 $para->[1]{'start_line'},
1582             "'=item' outside of any '=over'"
1583             );
1584             unshift @$paras,
1585 0         0 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1586             $para
1587             ;
1588 0         0 return 1;
1589             }
1590              
1591              
1592 0         0 my $over_type = $over->[1]{'~type'};
1593              
1594 0 0       0 if(!$over_type) {
    0          
    0          
    0          
    0          
1595             # Shouldn't happen1
1596             die "Typeless over in stack, starting at line "
1597 53         99 . $over->[1]{'start_line'};
1598              
1599             } elsif($over_type eq 'block') {
1600 53 0       201 unless($curr_open->[-1][1]{'~bitched_about'}) {
1601 53         277 $curr_open->[-1][1]{'~bitched_about'} = 1;
1602             $self->whine(
1603             $curr_open->[-1][1]{'start_line'},
1604             "You can't have =items (as at line "
1605 53         206 . $para->[1]{'start_line'}
1606             . ") unless the first thing after the =over is an =item"
1607             );
1608             }
1609             # Just turn it into a paragraph and reconsider it
1610 53         209 $para->[0] = '~Para';
1611 53         185 unshift @$paras, $para;
1612 53         169 return 1;
1613              
1614             } elsif($over_type eq 'text') {
1615 0         0 my $item_type = $self->_get_item_type($para);
1616             # That kills the content of the item if it's a number or bullet.
1617 53         266 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1618              
1619 53 0 0     73 if($item_type eq 'text') {
    0          
1620             # Nothing special needs doing for 'text'
1621             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1622             $self->whine(
1623 53         161 $para->[1]{'start_line'},
1624             "Expected text after =item, not a $item_type"
1625             );
1626             # Undo our clobbering:
1627 53         227 push @$para, $para->[1]{'~orig_content'};
1628 0         0 delete $para->[1]{'number'};
1629             # Only a PROPER item-number element is allowed
1630             # to have a number attribute.
1631             } else {
1632 0         0 die "Unhandled item type $item_type"; # should never happen
1633             }
1634              
1635             # =item-text thingies don't need any assimilation, it seems.
1636              
1637             } elsif($over_type eq 'number') {
1638 0         0 my $item_type = $self->_get_item_type($para);
1639             # That kills the content of the item if it's a number or bullet.
1640 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1641              
1642 0         0 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1643              
1644 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
    0          
1645             # Hm, it's not numeric. Correct for this.
1646 0         0 $para->[1]{'number'} = $expected_value;
1647             $self->whine(
1648 0         0 $para->[1]{'start_line'},
1649             "Expected '=item $expected_value'"
1650             );
1651 0         0 push @$para, $para->[1]{'~orig_content'};
1652             # restore the bullet, blocking the assimilation of next para
1653              
1654             } elsif($item_type eq 'text') {
1655             # Hm, it's not numeric. Correct for this.
1656 0         0 $para->[1]{'number'} = $expected_value;
1657             $self->whine(
1658 0         0 $para->[1]{'start_line'},
1659             "Expected '=item $expected_value'"
1660             );
1661             # Text content will still be there and will block next ~Para
1662              
1663             } elsif($item_type ne 'number') {
1664 0         0 die "Unknown item type $item_type"; # should never happen
1665              
1666             } elsif($expected_value == $para->[1]{'number'}) {
1667 0         0 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1668              
1669             } else {
1670 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1671             " instead of the expected value of $expected_value\n";
1672             $self->whine(
1673             $para->[1]{'start_line'},
1674 0         0 "You have '=item " . $para->[1]{'number'} .
1675             "' instead of the expected '=item $expected_value'"
1676             );
1677 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1678             }
1679              
1680 0 0       0 if(@$para == 2) {
1681             # For the cases where we /didn't/ push to @$para
1682 0 0       0 if($paras->[0][0] eq '~Para') {
1683 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1684 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1685             } else {
1686 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1687 0         0 push @$para, ''; # Just so it's not contentless
1688             }
1689             }
1690              
1691              
1692             } elsif($over_type eq 'bullet') {
1693 0         0 my $item_type = $self->_get_item_type($para);
1694             # That kills the content of the item if it's a number or bullet.
1695 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1696              
1697 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
1698             # as expected!
1699              
1700 0 0       0 if( $para->[1]{'~_freaky_para_hack'} ) {
1701 0         0 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1702 0         0 push @$para, $para->[1]{'~_freaky_para_hack'};
1703             }
1704              
1705             } elsif($item_type eq 'number') {
1706             $self->whine(
1707 0         0 $para->[1]{'start_line'},
1708             "Expected '=item *'"
1709             );
1710 0         0 push @$para, $para->[1]{'~orig_content'};
1711             # and block assimilation of the next paragraph
1712 0         0 delete $para->[1]{'number'};
1713             # Only a PROPER item-number element is allowed
1714             # to have a number attribute.
1715             } elsif($item_type eq 'text') {
1716             $self->whine(
1717 0         0 $para->[1]{'start_line'},
1718             "Expected '=item *'"
1719             );
1720             # But doesn't need processing. But it'll block assimilation
1721             # of the next para.
1722             } else {
1723 0         0 die "Unhandled item type $item_type"; # should never happen
1724             }
1725              
1726 0 0       0 if(@$para == 2) {
1727             # For the cases where we /didn't/ push to @$para
1728 0 0       0 if($paras->[0][0] eq '~Para') {
1729 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1730 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1731             } else {
1732 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1733 0         0 push @$para, ''; # Just so it's not contentless
1734             }
1735             }
1736              
1737             } else {
1738 0         0 die "Unhandled =over type \"$over_type\"?";
1739             # Shouldn't happen!
1740             }
1741 0         0 $para->[0] .= '-' . $over_type;
1742              
1743 0         0 return;
1744             }
1745              
1746             sub _ponder_Plain {
1747 3743     3745   6769 my ($self,$para) = @_;
1748 3743         4748 DEBUG and print STDERR " giving plain treatment...\n";
1749 3743 100 100     21152 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
      66        
      100        
1750             or $para->[1]{'~cooked'}
1751             ) {
1752             push @$para,
1753 3694         5308 @{$self->_make_treelet(
1754             join("\n", splice(@$para, 2)),
1755 3694         17549 $para->[1]{'start_line'}
1756             )};
1757             }
1758             # Empty paragraphs don't need a treelet for any reason I can see.
1759             # And precooked paragraphs already have a treelet.
1760 3743         7724 return;
1761             }
1762              
1763             sub _ponder_Verbatim {
1764 481     551   1078 my ($self,$para) = @_;
1765 481         620 DEBUG and print STDERR " giving verbatim treatment...\n";
1766              
1767 481         1162 $para->[1]{'xml:space'} = 'preserve';
1768              
1769 481 100       1281 unless ($self->{'_output_is_for_JustPod'}) {
1770             # Fix illegal settings for expand_verbatim_tabs()
1771             # This is because this module doesn't do input error checking, but khw
1772             # doesn't want to add yet another instance of that.
1773 400 100 100     1508 $self->expand_verbatim_tabs(8)
1774             if ! defined $self->expand_verbatim_tabs()
1775             || $self->expand_verbatim_tabs() =~ /\D/;
1776              
1777 400         1227 my $indent = $self->strip_verbatim_indent;
1778 400 100 100     1076 if ($indent && ref $indent eq 'CODE') {
1779 10         11 my @shifted = (shift @{$para}, shift @{$para});
  10         16  
  10         17  
1780 10         26 $indent = $indent->($para);
1781 10         57 unshift @{$para}, @shifted;
  10         27  
1782             }
1783              
1784 400         1103 for(my $i = 2; $i < @$para; $i++) {
1785 1756         3088 foreach my $line ($para->[$i]) { # just for aliasing
1786             # Strip indentation.
1787 1756 100       2922 $line =~ s/^\Q$indent// if $indent;
1788 1756 100       3137 next unless $self->expand_verbatim_tabs;
1789              
1790             # This is commented out because of github issue #85, and the
1791             # current maintainers don't know why it was there in the first
1792             # place.
1793             #&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1794 1755         5764 while( $line =~
1795             # Sort of adapted from Text::Tabs.
1796 987         2253 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1797             * $self->expand_verbatim_tabs)
1798             -(length($1)&7)))/e
1799             ) {}
1800              
1801             # TODO: whinge about (or otherwise treat) unindented or overlong lines
1802              
1803             }
1804             }
1805             }
1806              
1807             # Now the VerbatimFormatted hoodoo...
1808 1284 100 66     3250 if( $self->{'accept_codes'} and
    100          
1809             $self->{'accept_codes'}{'VerbatimFormatted'}
1810             ) {
1811 868   100     5187 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
  773         1276  
1812             # Kill any number of terminal newlines
1813 812         3935 $self->_verbatim_format($para);
1814             } elsif ($self->{'codes_in_verbatim'}) {
1815             push @$para,
1816 805         1670 @{$self->_make_treelet(
1817             join("\n", splice(@$para, 2)),
1818 53         120 $para->[1]{'start_line'}, $para->[1]{'xml:space'}
1819             )};
1820 53         73 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1821             } else {
1822 465 100       2092 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1823 465         2497 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1824             }
1825 532         1113 return;
1826             }
1827              
1828             sub _ponder_Data {
1829 73     95   242 my ($self,$para) = @_;
1830 73         165 DEBUG and print STDERR " giving data treatment...\n";
1831 22         49 $para->[1]{'xml:space'} = 'preserve';
1832 22 100       72 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1833 22         41 return;
1834             }
1835              
1836              
1837              
1838              
1839             ###########################################################################
1840              
1841             sub _traverse_treelet_bit { # for use only by the routine above
1842 7015     7266   13968 my($self, $name) = splice @_,0,2;
1843              
1844 7015         8711 my $scratch;
1845 7015         20498 $self->_handle_element_start(($scratch=$name), shift @_);
1846              
1847 7066         14275 while (@_) {
1848 12044         17167 my $x = shift;
1849 12044 100       18740 if (ref($x)) {
1850 3093         6221 &_traverse_treelet_bit($self, @$x);
1851             } else {
1852 9269   100     23299 $x .= shift while @_ && !ref($_[0]);
1853 8953         17125 $self->_handle_text($x);
1854             }
1855             }
1856              
1857 7066         18830 $self->_handle_element_end($scratch=$name);
1858 7052         19025 return;
1859             }
1860              
1861             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1862              
1863             sub _closers_for_all_curr_open {
1864 55     104   188 my $self = $_[0];
1865 43         118 my @closers;
1866 6 50       10 foreach my $still_open (@{ $self->{'curr_open'} || return }) {
  6         34  
1867 9         20 my @copy = @$still_open;
1868 23         102 $copy[1] = {%{ $copy[1] }};
  23         118  
1869             #$copy[1]{'start_line'} = -1;
1870 60 100       120 if($copy[0] eq '=for') {
    50          
1871 35         75 $copy[0] = '=end';
1872             } elsif($copy[0] eq '=over') {
1873             $self->whine(
1874             $still_open->[1]{start_line} ,
1875 38         74 "=over without closing =back"
1876             );
1877              
1878 38         70 $copy[0] = '=back';
1879             } else {
1880 32         90 die "I don't know how to auto-close an open $copy[0] region";
1881             }
1882              
1883 41 50       85 unless( @copy > 2 ) {
1884 1001         2543 push @copy, $copy[1]{'target'};
1885 1001 100       1434 $copy[-1] = '' unless defined $copy[-1];
1886             # since =over's don't have targets
1887             }
1888              
1889 1001         3475 $copy[1]{'fake-closer'} = 1;
1890              
1891 1001         2007 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1892 1126         1838 unshift @closers, \@copy;
1893             }
1894 1123         2099 return @closers;
1895             }
1896              
1897             #--------------------------------------------------------------------------
1898              
1899             sub _verbatim_format {
1900 171     122   416 my($it, $p) = @_;
1901              
1902 1076         2828 my $formatting;
1903              
1904 1076         2602 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1905 1340         3525 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1906 1340         3153 $p->[$i] .= "\n";
1907             # Unlike with simple Verbatim blocks, we don't end up just doing
1908             # a join("\n", ...) on the contents, so we have to append a
1909             # newline to every line, and then nix the last one later.
1910             }
1911              
1912 69         79 if( DEBUG > 4 ) {
1913             print STDERR "<<\n";
1914             for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1915             print STDERR "_verbatim_format $i: $p->[$i]";
1916             }
1917             print STDERR ">>\n";
1918             }
1919              
1920 69         171 for(my $i = $#$p; $i > 2; $i--) {
1921             # work backwards over the lines, except the first (#2)
1922              
1923             #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1924             # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1925             # look at a formatty line preceding a nonformatty one
1926 286         325 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1927 286 100       479 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1928 17         27 DEBUG > 5 and print STDERR " It's a formatty line. ",
1929             "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1930              
1931 17 100       34 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1932 7         28 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1933 7         21 next;
1934             } else {
1935 16         25 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1936             }
1937             } else {
1938 271         292 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1939 271         495 next;
1940             }
1941              
1942             # A formatty line has to have #: in the first two columns, and uses
1943             # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1944             # Example:
1945             # What do you want? i like pie. [or whatever]
1946             # #:^^^^^^^^^^^^^^^^^ /////////////
1947              
1948              
1949 10         10 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1950              
1951 16         38 $formatting = ' ' . $1;
1952 16         54 $formatting =~ s/\s+$//s; # nix trailing whitespace
1953 16 50 33     56 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1954 6         13 splice @$p,$i,1; # remove this line
1955 6         15 $i--; # don't consider next line
1956 6         19 next;
1957             }
1958              
1959 14 100       28 if( length($formatting) >= length($p->[$i-1]) ) {
1960 40         76 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1961             } else {
1962 44         67 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1963             }
1964             # Make $formatting and the previous line be exactly the same length,
1965             # with $formatting having a " " as the last character.
1966              
1967 47         105 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1968              
1969              
1970 220         270 my @new_line;
1971 220         569 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
1972             #print STDERR "Format matches $1\n";
1973              
1974 91 100       156 if($2) {
1975             #print STDERR "SKIPPING <$2>\n";
1976 69         215 push @new_line,
1977             substr($p->[$i-1], pos($formatting)-length($1), length($1));
1978             } else {
1979             #print STDERR "SNARING $+\n";
1980 195 50       322 push @new_line, [
    100          
    100          
1981             (
1982             $3 ? 'VerbatimB' :
1983             $4 ? 'VerbatimI' :
1984             $5 ? 'VerbatimBI' : die("Should never get called")
1985             ), {},
1986             substr($p->[$i-1], pos($formatting)-length($1), length($1))
1987             ];
1988             #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1989             }
1990             }
1991 183         388 my @nixed =
1992             splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1993 10         23 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1994              
1995 10         13 DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1996             scalar(@new_line), "):",
1997             map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1998 10         25 $i--; # So the next line we scrutinize is the line before the one
1999             # that we just went and formatted
2000             }
2001              
2002 65         126 $p->[0] = 'VerbatimFormatted';
2003              
2004             # Collapse adjacent text nodes, just for kicks.
2005 65         145 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2006 173 0 0     198 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2007 173         286 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2008 0         0 $p->[$i] .= splice @$p, $i+1, 1; # merge
2009 0         0 --$i; # and back up
2010             }
2011             }
2012              
2013             # Now look for the last text token, and remove the terminal newline
2014 65         156 for( my $i = $#$p; $i >= 2; $i-- ) {
2015             # work backwards over the tokens, even the first
2016 65 50       129 if( !ref($p->[$i]) ) {
2017 65 50       275 if($p->[$i] =~ s/\n$//s) {
2018 65         89 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2019             } else {
2020 0         0 DEBUG > 5 and print STDERR
2021             "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2022             }
2023 65         122 last; # we only want the next one
2024             }
2025             }
2026              
2027 65         113 return;
2028             }
2029              
2030              
2031             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2032              
2033              
2034             sub _treelet_from_formatting_codes {
2035             # Given a paragraph, returns a treelet. Full of scary tokenizing code.
2036             # Like [ '~Top', {'start_line' => $start_line},
2037             # "I like ",
2038             # [ 'B', {}, "pie" ],
2039             # "!"
2040             # ]
2041             # This illustrates the general format of a treelet. It is an array:
2042             # [0] is a scalar indicating its type. In the example above, the
2043             # types are '~Top' and 'B'
2044             # [1] is a hash of various flags about it, possibly empty
2045             # [2] - [N] are an ordered list of the subcomponents of the treelet.
2046             # Scalars are literal text, refs are sub-treelets, to
2047             # arbitrary levels. Stringifying a treelet will recursively
2048             # stringify the sub-treelets, concatentating everything
2049             # together to form the exact text of the treelet.
2050              
2051 3696     3749   7686 my($self, $para, $start_line, $preserve_space) = @_;
2052              
2053 3696         10995 my $treelet = ['~Top', {'start_line' => $start_line},];
2054              
2055 3696 100 100     13351 unless ($preserve_space || $self->{'preserve_whitespace'}) {
2056 2829         24783 $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2057 2829         5413 $para =~ s/ $//;
2058 2829         4205 $para =~ s/^ //;
2059             }
2060              
2061             # Only apparent problem the above code is that N<< >> turns into
2062             # N<< >>. But then, word wrapping does that too! So don't do that!
2063              
2064              
2065             # As a Start-code is encountered, the number of opening bracket '<'
2066             # characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2067             # etc). When closing brackets are found in the text, at least this number
2068             # (plus the 1) will be required to mean the Start-code is terminated. When
2069             # those are found, @stack is popped.
2070 3696         5216 my @stack;
2071              
2072 3696         6530 my @lineage = ($treelet);
2073 3696         5106 my $raw = ''; # raw content of L<> fcode before splitting/processing
2074             # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2075             # into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2076             # the 'collapse and trim all whitespace first' lines just above.
2077 3696         4538 my $inL = 0;
2078              
2079 3696         4446 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2080              
2081             # Here begins our frightening tokenizer RE. The following regex matches
2082             # text in four main parts:
2083             #
2084             # * Start-codes. The first alternative matches C< or C<<, the latter
2085             # followed by some whitespace. $1 will hold the entire start code
2086             # (including any space following a multiple-angle-bracket delimiter),
2087             # and $2 will hold only the additional brackets past the first in a
2088             # multiple-bracket delimiter. length($2) + 1 will be the number of
2089             # closing brackets we have to find.
2090             #
2091             # * Closing brackets. Match some amount of whitespace followed by
2092             # multiple close brackets. The logic to see if this closes anything
2093             # is down below. Note that in order to parse C<< >> correctly, we
2094             # have to use look-behind (?<=\s\s), since the match of the starting
2095             # code will have consumed the whitespace.
2096             #
2097             # * A single closing bracket, to close a simple code like C<>.
2098             #
2099             # * Something that isn't a start or end code. We have to be careful
2100             # about accepting whitespace, since perlpodspec says that any whitespace
2101             # before a multiple-bracket closing delimiter should be ignored.
2102             #
2103 3733         36604 while($para =~
2104             m/\G
2105             (?:
2106             # Match starting codes, including the whitespace following a
2107             # multiple-delimiter start code. $1 gets the whole start code and
2108             # $2 gets all but one of the
2109             ([A-Z]<(?:(<+)\s+)?)
2110             |
2111             # Match multiple-bracket end codes. $3 gets the whitespace that
2112             # should be discarded before an end bracket but kept in other cases
2113             # and $4 gets the end brackets themselves. ($3 can be empty if the
2114             # construct is empty, like C<< >>, and all the white-space has been
2115             # gobbled up already, considered to be space after the opening
2116             # bracket. In this case we use look-behind to verify that there are
2117             # at least 2 spaces in a row before the ">".)
2118             (\s+|(?<=\s\s))(>{2,})
2119             |
2120             (\s?>) # $5: simple end-codes
2121             |
2122             ( # $6: stuff containing no start-codes or end-codes
2123             (?:
2124             [^A-Z\s>]
2125             |
2126             (?:
2127             [A-Z](?!<)
2128             )
2129             |
2130             # whitespace is ok, but we don't want to eat the whitespace before
2131             # a multiple-bracket end code.
2132             # NOTE: we may still have problems with e.g. S<< >>
2133             (?:
2134             \s(?!\s*>{2,})
2135             )
2136             )+
2137             )
2138             )
2139             /xgo
2140             ) {
2141 14886         19371 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2142 14849 100       42357 if(defined $1) {
    100          
    100          
    50          
2143 2990         3686 my $bracket_count; # How many '<<<' in a row this has. Needed for
2144             # Pod::Simple::JustPod
2145 2990 100       5413 if(defined $2) {
2146 114         257 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2147 151         362 $bracket_count = length($2) + 1;
2148 151         367 push @stack, $bracket_count; # length of the necessary complex
2149             # end-code string
2150             } else {
2151 2913         3624 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2152 2913         4265 push @stack, 0; # signal that we're looking for simple
2153 2876         4007 $bracket_count = 1;
2154             }
2155 3027         6192 my $code = substr($1,0,1);
2156 3027 100       5353 if ('L' eq $code) {
2157 1180 100       2590 if ($inL) {
2158 748         2254 $raw .= $1;
2159 748         2800 $self->scream( $start_line,
2160             'Nested L<> are illegal. Pretending inner one is '
2161             . 'X<...> so can continue looking for other errors.');
2162 748         4045 $code = "X";
2163             }
2164             else {
2165 1179         2434 $raw = ""; # reset raw content accumulator
2166 1179         1976 $inL = @stack;
2167             }
2168             } else {
2169 3304 100       5548 $raw .= $1 if $inL;
2170             }
2171 3737         8805 push @lineage, [ $code, {}, ]; # new node object
2172              
2173             # Tell Pod::Simple::JustPod how many brackets there were, but to save
2174             # space, not in the most usual case of there was just 1. It can be
2175             # inferred by the absence of this element. Similarly, if there is more
2176             # than one bracket, extract the white space between the final bracket
2177             # and the real beginning of the interior. Save that if it isn't just a
2178             # single space
2179 3737 100 100     9039 if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2180 765         1079 $lineage[-1][1]{'~bracket_count'} = $bracket_count;
2181 765         1004 my $lspacer = substr($1, 1 + $bracket_count);
2182 765 100       6738 $lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2183             }
2184 4023         5480 push @{ $lineage[-2] }, $lineage[-1];
  4023         20595  
2185             } elsif(defined $4) {
2186 210         313 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2187             # This is where it gets messy...
2188 210 100       721 if(! @stack) {
    100          
    100          
    50          
2189             # We saw " >>>>" but needed nothing. This is ALL just stuff then.
2190 2         88 DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2191 2         11 push @{ $lineage[-1] }, $3, $4;
  2         12  
2192 88         132 next;
2193             } elsif(!$stack[-1]) {
2194             # We saw " >>>>" but needed only ">". Back pos up.
2195 87         140 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2196 87         127 push @{ $lineage[-1] }, $3; # That was a for-real space, too.
  88         206  
2197 88         163 pos($para) = pos($para) - length($4) + 1;
2198             } elsif($stack[-1] == length($4)) {
2199             # We found " >>>>", and it was exactly what we needed. Commonest case.
2200 114         214 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2201             } elsif($stack[-1] < length($4)) {
2202             # We saw " >>>>" but needed only " >>". Back pos up.
2203 0         0 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2204 0         0 pos($para) = pos($para) - length($4) + $stack[-1];
2205             } else {
2206             # We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2207 7         16 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2208 7         15 push @{ $lineage[-1] }, $3, $4;
  7         29  
2209 95         214 next;
2210             }
2211             #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2212              
2213 202 50 66     557 if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2214 91 100       325 if ($3 ne "") {
    100          
2215 1         4 $lineage[-1][1]{'~rspacer'} = $3;
2216             }
2217             elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2218              
2219             # Here we had something like C<< >> which was a false positive
2220 1         5 delete $lineage[-1][1]{'~lspacer'};
2221             }
2222             else {
2223             $lineage[-1][1]{'~rspacer'}
2224 1         5 = substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2225 89         138 chop $lineage[-1][1]{'~lspacer'};
2226             }
2227             }
2228              
2229 202 100       725 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  6         22  
  118         389  
2230             # Keep the element from being childless
2231              
2232 114 100       266 if ($inL == @stack) {
2233 22         63 $lineage[-1][1]{'raw'} = $raw;
2234 22         31 $inL = 0;
2235             }
2236              
2237 114         185 pop @stack;
2238 117         171 pop @lineage;
2239              
2240 117 100       663 $raw .= $3.$4 if $inL;
2241              
2242             } elsif(defined $5) {
2243 2974         3456 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2244              
2245 2974 100 100     9533 if(@stack and ! $stack[-1]) {
2246             # We're indeed expecting a simple end-code
2247 2869         3765 DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2248              
2249 2868 50       6010 if(length($5) == 2) { # There was a space there: " >"
    100          
2250 0         0 push @{ $lineage[-1] }, ' ';
  0         0  
2251 2868         5956 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2252 36         54 push @{ $lineage[-1] }, ''; # keep it from being really childless
  36         94  
2253             }
2254              
2255 2872 100       5753 if ($inL == @stack) {
2256 410         1175 $lineage[-1][1]{'raw'} = $raw;
2257 410         596 $inL = 0;
2258             }
2259              
2260 2868         3847 pop @stack;
2261 2868         3884 pop @lineage;
2262             } else {
2263 103         160 DEBUG > 4 and print STDERR " It's just stuff.\n";
2264 107         160 push @{ $lineage[-1] }, $5;
  103         254  
2265             }
2266              
2267 2975 100       20349 $raw .= $5 if $inL;
2268              
2269             } elsif(defined $6) {
2270 8770         10028 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2271 8766         10080 push @{ $lineage[-1] }, $6;
  8766         21439  
2272 8770 100       31202 $raw .= $6 if $inL;
2273             # XXX does not capture multiplace whitespaces -- 'raw' ends up with
2274             # at most 1 leading/trailing whitespace, why not all of it?
2275             # Answer, because we deliberately trimmed it above
2276              
2277             } else {
2278             # should never ever ever ever happen
2279 4         6 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2280 4         24 die "SPORK 512512!";
2281             }
2282             }
2283              
2284 3781 100       7531 if(@stack) { # Uhoh, some sequences weren't closed.
2285 93         347 my $x= "...";
2286 92         136 while(@stack) {
2287 92 50       197 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  0         0  
  8         21  
2288             # Hmmmmm!
2289              
2290 92         203 my $code = (pop @lineage)->[0];
2291 13         24 my $ender_length = pop @stack;
2292 13 50       30 if($ender_length) {
2293 84         179 --$ender_length;
2294 0         0 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2295             } else {
2296 8         37 $x = $code . "<$x>";
2297             }
2298             }
2299 92         132 DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2300 92         164 $self->whine($start_line,
2301             "Unterminated $x sequence",
2302             );
2303             }
2304              
2305 3697         10708 return $treelet;
2306             }
2307              
2308             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2309              
2310             sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2311 1     0 0 2 return stringify_lol($_[1]);
2312             }
2313              
2314             sub stringify_lol { # function: stringify_lol($lol)
2315 2607     3409 0 4065 my $string_form = '';
2316 2691         5881 _stringify_lol( $_[0] => \$string_form );
2317 3462         8379 return $string_form;
2318             }
2319              
2320             sub _stringify_lol { # the real recursor
2321 3842     3037   5811 my($lol, $to) = @_;
2322 3842         8510 for(my $i = 2; $i < @$lol; ++$i) {
2323 4840 100 100     12403 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
      66        
2324 380         673 _stringify_lol( $lol->[$i], $to); # recurse!
2325             } else {
2326 3604         7810 $$to .= $lol->[$i];
2327             }
2328             }
2329 3733         6213 return;
2330             }
2331              
2332             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2333              
2334             sub _dump_curr_open { # return a string representation of the stack
2335 0     32   0 my $curr_open = $_[0]{'curr_open'};
2336              
2337 0 50       0 return '[empty]' unless @$curr_open;
2338             return join '; ',
2339 0         0 map {;
2340             ($_->[0] eq '=for')
2341             ? ( ($_->[1]{'~really'} || '=over')
2342 0 50 50     0 . ' ' . $_->[1]{'target'})
2343             : $_->[0]
2344             }
2345             @$curr_open
2346             ;
2347             }
2348              
2349             ###########################################################################
2350             my %pretty_form = (
2351             "\a" => '\a', # ding!
2352             "\b" => '\b', # BS
2353             "\e" => '\e', # ESC
2354             "\f" => '\f', # FF
2355             "\t" => '\t', # tab
2356             "\cm" => '\cm',
2357             "\cj" => '\cj',
2358             "\n" => '\n', # probably overrides one of either \cm or \cj
2359             '"' => '\"',
2360             '\\' => '\\\\',
2361             '$' => '\\$',
2362             '@' => '\\@',
2363             '%' => '\\%',
2364             '#' => '\\#',
2365             );
2366              
2367             sub pretty { # adopted from Class::Classless
2368             # Not the most brilliant routine, but passable.
2369             # Don't give it a cyclic data structure!
2370 502     1494 0 2251 my @stuff = @_; # copy
2371 502         617 my $x;
2372             my $out =
2373             # join ",\n" .
2374             join ", ",
2375 502         733 map {;
2376 592 50 100     3985 if(!defined($_)) {
    100 66        
    50 33        
    100          
    100          
    100          
2377 0         0 "undef";
2378             } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2379 58         142 $x = "[ " . pretty(@$_) . " ]" ;
2380 58         128 $x;
2381             } elsif(ref($_) eq 'SCALAR') {
2382 0         0 $x = "\\" . pretty($$_) ;
2383 0         0 $x;
2384             } elsif(ref($_) eq 'HASH') {
2385 795         2569 my $hr = $_;
2386             $x = "{" . join(", ",
2387 48         446 map(pretty($_) . '=>' . pretty($hr->{$_}),
2388             sort keys %$hr ) ) . "}" ;
2389 48         171 $x;
2390 2         5 } elsif(!length($_)) { q{''} # empty string
2391             } elsif(
2392             $_ eq '0' # very common case
2393             or(
2394             m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
2395             and $_ ne '-0' # the strange case that RE lets thru
2396             )
2397 26         83 ) { $_;
2398             } else {
2399             # Yes, explicitly name every character desired. There are shorcuts one
2400             # could make, but I (Karl Williamson) was afraid that some Perl
2401             # releases would have bugs in some of them. For example [A-Z] works
2402             # even on EBCDIC platforms to match exactly the 26 uppercase English
2403             # letters, but I don't know if it has always worked without bugs. It
2404             # seemed safest just to list the characters.
2405             # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2406 458 0       909 s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
  0         0  
2407             <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2408 458         1318 #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg;
2409             qq{"$_"};
2410             }
2411             } @stuff;
2412 502         2234 # $out =~ s/\n */ /g if length($out) < 75;
2413             return $out;
2414             }
2415              
2416             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2417              
2418             # A rather unsubtle method of blowing away all the state information
2419             # from a parser object so it can be reused. Provided as a utility for
2420             # backward compatibility in Pod::Man, etc. but not recommended for
2421             # general use.
2422              
2423 5     9 0 9042 sub reinit {
2424 5         28 my $self = shift;
2425             foreach (qw(source_dead source_filename doc_has_started
2426             start_of_pod_block content_seen last_was_blank paras curr_open
2427             line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2428             Title)) {
2429 83         232  
2430             delete $self->{$_};
2431             }
2432             }
2433              
2434             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2435             1;
2436