File Coverage

blib/lib/Pod/Simple/BlackBox.pm
Criterion Covered Total %
statement 854 1006 84.8
branch 368 526 69.9
condition 132 189 69.8
subroutine 59 65 90.7
pod 0 7 0.0
total 1413 1793 78.8


line stmt bran cond sub pod time code
1             package Pod::Simple::BlackBox;
2             #
3             # "What's in the box?" "Pain."
4             #
5             ###########################################################################
6             #
7             # This is where all the scary things happen: parsing lines into
8             # paragraphs; and then into directives, verbatims, and then also
9             # turning formatting sequences into treelets.
10             #
11             # Are you really sure you want to read this code?
12             #
13             #-----------------------------------------------------------------------------
14             #
15             # The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16             # of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17             # to call the proper callbacks on the treelets.
18             #
19             # Every node in a treelet is a ['name', {attrhash}, ...children...]
20              
21 69     69   2300 use integer; # vroom!
  69         137  
  69         477  
22 69     69   1969 use strict;
  69         123  
  69         1165  
23 69     69   285 use Carp ();
  69         297  
  69         1475  
24 69     69   298 use vars qw($VERSION );
  69         112  
  69         13240  
25             $VERSION = '3.42';
26             #use constant DEBUG => 7;
27              
28             sub my_qr ($$) {
29              
30             # $1 is a pattern to compile and return. Older perls compile any
31             # syntactically valid property, even if it isn't legal. To cope with
32             # this, return an empty string unless the compiled pattern also
33             # successfully matches $2, which the caller furnishes.
34              
35 450     450 0 1196 my ($input_re, $should_match) = @_;
36             # XXX could have a third parameter $shouldnt_match for extra safety
37              
38 450 50       3177 my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39              
40 67     67   1173 my $re = eval "no warnings; $use_utf8 qr/$input_re/";
  67     67   153  
  67     67   3190  
  67     67   940  
  67     67   196  
  67     67   3997  
  67     67   41336  
  67     67   2237  
  67     67   1010  
  67     67   1583  
  67     37   190  
  67     0   2180  
  67     0   411  
  67     0   115  
  67         2394  
  67         450  
  67         127  
  67         2156  
  67         399  
  67         144  
  67         2266  
  67         381  
  67         121  
  67         2127  
  67         458  
  67         130  
  67         2046  
  67         539  
  67         139  
  67         3909  
  450         24847  
  3         7  
  3         6  
  4         32  
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
41             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
42 450 50       1789 return "" if $@;
43              
44 67     67   6362 my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
  67     67   136  
  67     67   2204  
  67     67   581  
  67     67   127  
  67     67   3649  
  67     67   426  
  67     67   128  
  67     67   2276  
  67     747   378  
  67     0   120  
  67     3   2135  
  67     0   420  
  67         123  
  67         2512  
  67         375  
  67         118  
  67         10279  
  67         407  
  67         161  
  67         2753  
  67         394  
  67         127  
  67         2231  
  67         386  
  67         136  
  67         2721  
  450         23237  
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
45             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
46 450 50       1765 return "" if $@;
47              
48             #print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49 450 50       1507 return $re if $matches;
50              
51             #print STDERR __LINE__, ": $re: didn't match\n";
52 1         2 return "";
53             }
54              
55             BEGIN {
56 69     69   1484 require Pod::Simple;
57 69 50       5341 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58             }
59              
60             # Matches a character iff the character will have a different meaning
61             # if we choose CP1252 vs UTF-8 if there is no =encoding line.
62             # This is broken for early Perls on non-ASCII platforms.
63             my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64             $non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65              
66             # Use patterns understandable by Perl 5.6, if possible
67 69     69   482 my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
  69         145  
  69         61971  
68             my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69             # to get assigned
70             my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71             "\x{250}");
72             $rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73              
74 67     67   482 my $script_run_re = eval 'no warnings "experimental::script_run";
  67         124  
  67         14669  
75             qr/(*script_run: ^ .* $ )/x';
76             my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77             unless ($latin_re) {
78             # This was machine generated to be the ranges of the union of the above
79             # three properties, with things that were undefined by Unicode 4.1 filling
80             # gaps. That is the version in use when Perl advanced enough to
81             # successfully compile and execute the above pattern.
82             $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83             }
84              
85             my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86              
87             # Latin script code points not in the first release of Unicode
88             my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89              
90             # If this perl doesn't have the Deprecated property, there's only one code
91             # point in it that we need be concerned with.
92             my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93             $deprecated_re = qr/\x{149}/ unless $deprecated_re;
94              
95             my $utf8_bom;
96             if (($] ge 5.007_003)) {
97             $utf8_bom = "\x{FEFF}";
98             utf8::encode($utf8_bom);
99             } else {
100             $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101             }
102              
103             # This is used so that the 'content_seen' method doesn't return true on a
104             # file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105             # there is a valid =foo line will we return that content was seen.
106             my $seen_legal_directive = 0;
107              
108             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109              
110 1     1 0 43 sub parse_line { shift->parse_lines(@_) } # alias
111              
112             # - - - Turn back now! Run away! - - -
113              
114             sub parse_lines { # Usage: $parser->parse_lines(@lines)
115             # an undef means end-of-stream
116 7636     7636 0 11107 my $self = shift;
117              
118 7636         11017 my $code_handler = $self->{'code_handler'};
119 7636         10494 my $cut_handler = $self->{'cut_handler'};
120 7636         9610 my $wl_handler = $self->{'whiteline_handler'};
121 7636   100     14580 $self->{'line_count'} ||= 0;
122              
123 7636         9447 my $scratch;
124              
125             DEBUG > 4 and
126 7636         8841 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127              
128 7636         8297 DEBUG > 5 and
129             print STDERR "# About to parse lines: ",
130             join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131              
132 7636   100     15455 my $paras = ($self->{'paras'} ||= []);
133             # paragraph buffer. Because we need to defer processing of =over
134             # directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135             # to process this.
136              
137 7636   100     14814 $self->{'pod_para_count'} ||= 0;
138              
139             # An attempt to match the pod portions of a line. This is not fool proof,
140             # but is good enough to serve as part of the heuristic for guessing the pod
141             # encoding if not specified.
142 7636         9515 my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
  73544         141937  
  7636         38502  
143 7636         35142 my $pod_chars_re = qr/ ^ = [A-Za-z]+ | [\Q$codes\E] < /x;
144              
145 7636         11204 my $line;
146 7801         12918 foreach my $source_line (@_) {
147 16149 50       28357 if( $self->{'source_dead'} ) {
148 166         9788 DEBUG > 4 and print STDERR "# Source is dead.\n";
149 166         632 last;
150             }
151              
152 16149 100       39613 unless( defined $source_line ) {
153 833         1443 DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154              
155 833         2537 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156 667         1524 push @$paras, $paras->[-1], $paras->[-1];
157             # So that it definitely fills the buffer.
158 667         1184 $self->{'source_dead'} = 1;
159 2503         4189 $self->_ponder_paragraph_buffer;
160 2503         3686 next;
161             }
162              
163              
164 17152 100       26782 if( $self->{'line_count'}++ ) {
165 16483         28907 ($line = $source_line) =~ tr/\n\r//d;
166             # If we don't have two vars, we'll end up with that there
167             # tr/// modding the (potentially read-only) original source line!
168              
169             } else {
170 2505         4283 DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171              
172 2505 50       5898 if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
    50          
    50          
173 1836         1950 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174 1836         2053 $self->_handle_encoding_line( "=encoding utf8" );
175 1836         3440 delete $self->{'_processed_encoding'};
176 1836         3423 $line =~ tr/\n\r//d;
177              
178             } elsif( $line =~ s/^\xFE\xFF//s ) {
179 1836         2180 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180             $self->scream(
181 17138         31961 $self->{'line_count'},
182             "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183             );
184 1836         8522 splice @_;
185 1836         9187 push @_, undef;
186 1836         2584 next;
187              
188             # TODO: implement somehow?
189              
190             } elsif( $line =~ s/^\xFF\xFE//s ) {
191 1836         2796 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192             $self->scream(
193 3035         5680 $self->{'line_count'},
194             "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195             );
196 0         0 splice @_;
197 0         0 push @_, undef;
198 3035         5687 next;
199              
200             # TODO: implement somehow?
201              
202             } else {
203 890         1260 DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204 890         2396 ($line = $source_line) =~ tr/\n\r//d;
205             }
206             }
207              
208 15537 100 100     100585 if(!$self->{'parse_characters'} && !$self->{'encoding'}
      100        
      100        
      100        
209             && ($self->{'in_pod'} || $line =~ /^=/s)
210             && $line =~ /$non_ascii_re/
211             ) {
212              
213 231         391 my $encoding;
214              
215             # No =encoding line, and we are at the first pod line in the input that
216             # contains a non-ascii byte, that is, one whose meaning varies depending
217             # on whether the file is encoded in UTF-8 or CP1252, which are the two
218             # possibilities permitted by the pod spec. (ASCII is assumed if the
219             # file only contains ASCII bytes.) In order to process this line, we
220             # need to figure out what encoding we will use for the file.
221             #
222             # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223             # 160-255, but it is used here, as it often colloquially is, to refer to
224             # the complete set of code points 0-255, including ASCII (0-127), the C1
225             # controls (128-159), and strict Latin 1 (160-255).
226             #
227             # CP1252 is effectively a superset of Latin 1, because it differs only
228             # from colloquial 8859-1 in the C1 controls, which are very unlikely to
229             # actually be present in 8859-1 files, so can be used for other purposes
230             # without conflict. CP 1252 uses most of them for graphic characters.
231             #
232             # Note that all ASCII-range bytes represent their corresponding code
233             # points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234             # code points require multiple (non-ASCII) bytes to represent. (A
235             # separate paragraph for EBCDIC is below.) The multi-byte
236             # representation is quite structured. If we find an isolated byte that
237             # would require multiple bytes to represent in UTF-8, we know that the
238             # encoding is not UTF-8. If we find a sequence of bytes that violates
239             # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240             # hence must be 1252.
241             #
242             # But there are ambiguous cases where we could guess wrong. If so, the
243             # user will end up having to supply an =encoding line. We use all
244             # readily available information to improve our chances of guessing
245             # right. The odds of something not being UTF-8, but still passing a
246             # UTF-8 validity test go down very rapidly with increasing length of the
247             # sequence. Therefore we look at all non-ascii sequences on the line.
248             # If any of the sequences can't be UTF-8, we quit there and choose
249             # CP1252. If all could be UTF-8, we see if any of the code points
250             # represented are unlikely to be in pod. If so, we guess CP1252. If
251             # not, we check if the line is all in the same script; if not guess
252             # CP1252; otherwise UTF-8. For perls that don't have convenient script
253             # run testing, see if there is both Latin and non-Latin. If so, CP1252,
254             # otherwise UTF-8.
255             #
256             # On EBCDIC platforms, the situation is somewhat different. In
257             # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258             # but so do the bytes that are for the C1 controls. Recall that these
259             # correspond to the unused portion of 8859-1 that 1252 mostly takes
260             # over. That means that there are fewer code points that are
261             # represented by multi-bytes. But, note that the these controls are
262             # very unlikely to be in pod text. So if we encounter one of them, it
263             # means that it is quite likely CP1252 and not UTF-8. The net result is
264             # the same code below is used for both platforms.
265             #
266             # XXX probably if the line has E that evaluates to illegal CP1252,
267             # then it is UTF-8. But we haven't processed E<> yet.
268              
269 231 50       575 goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
270              
271 231         365 my $copy;
272              
273 69     69   551 no warnings 'utf8';
  69         150  
  69         5190  
274              
275 2824 50       5117 if ($] ge 5.007_003) {
276 2609         5272 $copy = $line;
277              
278             # On perls that have this function, we can use it to easily see if the
279             # sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280             # needed below for script run detection
281 225 100       410 goto set_1252 if ! utf8::decode($copy);
282             }
283             elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284             # code page doing here anyway?
285             goto set_utf8;
286             }
287             else { # ASCII, no decode(): do it ourselves using the fundamental
288             # characteristics of UTF-8
289 69     69   40518 use if $] le 5.006002, 'utf8';
  69         851  
  69         1467  
290              
291 215         1173 my $char_ord;
292             my $needed; # How many continuation bytes to gobble up
293              
294             # Initialize the translated line with a dummy character that will be
295             # deleted after everything else is done. This dummy makes sure that
296             # $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297             # with upgrading in the middle
298 0         0 $copy = chr(0x100);
299              
300             # Parse through the line
301 0         0 for (my $i = 0; $i < length $line; $i++) {
302 0         0 my $byte = substr($line, $i, 1);
303              
304             # ASCII bytes are trivially dealt with
305 0 0       0 if ($byte !~ $non_ascii_re) {
306 0         0 $copy .= $byte;
307 0         0 next;
308             }
309              
310 0         0 my $b_ord = ord $byte;
311              
312             # Now figure out what this code point would be if the input is
313             # actually in UTF-8. If, in the process, we discover that it isn't
314             # well-formed UTF-8, we guess CP1252.
315             #
316             # Start the process. If it is UTF-8, we are at the first, start
317             # byte, of a multi-byte sequence. We look at this byte to figure
318             # out how many continuation bytes are needed, and to initialize the
319             # code point accumulator with the data from this byte.
320             #
321             # Normally the minimum continuation byte is 0x80, but in certain
322             # instances the minimum is a higher number. So the code below
323             # overrides this for those instances.
324 0         0 my $min_cont = 0x80;
325              
326 0 0       0 if ($b_ord < 0xC2) { # A start byte < C2 is malformed
    0          
    0          
    0          
327 0         0 goto set_1252;
328             }
329             elsif ($b_ord <= 0xDF) {
330 0         0 $needed = 1;
331 0         0 $char_ord = $b_ord & 0x1F;
332             }
333             elsif ($b_ord <= 0xEF) {
334 0 0       0 $min_cont = 0xA0 if $b_ord == 0xE0;
335 0         0 $needed = 2;
336 215         286 $char_ord = $b_ord & (0x1F >> 1);
337             }
338             elsif ($b_ord <= 0xF4) {
339 215 0       528 $min_cont = 0x90 if $b_ord == 0xF0;
340 2814         15789 $needed = 3;
341 11         21 $char_ord = $b_ord & (0x1F >> 2);
342             }
343             else { # F4 is the highest start byte for legal Unicode; higher is
344             # unlikely to be in pod.
345 11         61 goto set_1252;
346             }
347              
348             # ? not enough continuation bytes available
349 11 0       20 goto set_1252 if $i + $needed >= length $line;
350              
351             # Accumulate the ordinal of the character from the remaining
352             # (continuation) bytes.
353 11         40 while ($needed-- > 0) {
354 11         20 my $cont = substr($line, ++$i, 1);
355 11         126 $b_ord = ord $cont;
356 0 0 0     0 goto set_1252 if $b_ord < $min_cont || $b_ord > 0xBF;
357              
358             # In all cases, any next continuation bytes all have the same
359             # minimum legal value
360 0         0 $min_cont = 0x80;
361              
362             # Accumulate this byte's contribution to the code point
363 0         0 $char_ord <<= 6;
364 0         0 $char_ord |= ($b_ord & 0x3F);
365             }
366              
367             # Here, the sequence that formed this code point was valid UTF-8,
368             # so add the completed character to the output
369 0         0 $copy .= chr $char_ord;
370             } # End of loop through line
371              
372             # Delete the dummy first character
373 0         0 $copy = substr($copy, 1);
374             }
375              
376             # Here, $copy is legal UTF-8.
377              
378             # If it can't be legal CP1252, no need to look further. (These bytes
379             # aren't valid in CP1252.) This test could have been placed higher in
380             # the code, but it seemed wrong to set the encoding to UTF-8 without
381             # making sure that the very first instance is well-formed. But what if
382             # it isn't legal CP1252 either? We have to choose one or the other, and
383             # It seems safer to favor the single-byte encoding over the multi-byte.
384 0 50       0 goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385              
386             # The C1 controls are not likely to appear in pod
387 0 50       0 goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388              
389             # Nor are surrogates nor unassigned, nor deprecated.
390 0 50       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391 0 50 33     0 goto set_1252 if $cs_re && $copy =~ $cs_re;
392 0 50 33     0 DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393 0 50 33     0 goto set_1252 if $cn_re && $copy =~ $cn_re;
394 0 50       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395 0 50       0 goto set_1252 if $copy =~ $deprecated_re;
396              
397             # Nor are rare code points. But this is hard to determine. khw
398             # believes that IPA characters and the modifier letters are unlikely to
399             # be in pod (and certainly very unlikely to be the in the first line in
400             # the pod containing non-ASCII)
401 0 100       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402 0 100 66     0 goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403              
404             # The first Unicode version included essentially every Latin character
405             # in modern usage. So, a Latin character not in the first release will
406             # unlikely be in pod.
407 0 50 33     0 DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408 0 50 33     0 goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409              
410             # On perls that handle script runs, if the UTF-8 interpretation yields
411             # a single script, we guess UTF-8, otherwise just having a mixture of
412             # scripts is suspicious, so guess CP1252. We first strip off, as best
413             # we can, the ASCII characters that look like they are pod directives,
414             # as these would always show as mixed with non-Latin text.
415 0         0 $copy =~ s/$pod_chars_re//g;
416              
417 0 50       0 if ($script_run_re) {
418 0 0       0 goto set_utf8 if $copy =~ $script_run_re;
419 0         0 DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420 0         0 goto set_1252;
421             }
422              
423             # Even without script runs, but on recent enough perls and Unicodes, we
424             # can check if there is a mixture of both Latin and non-Latin. Again,
425             # having a mixture of scripts is suspicious, so assume CP1252
426              
427             # If it's all non-Latin, there is no CP1252, as that is Latin
428             # characters and punct, etc.
429 0 50       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430 0 50       0 goto set_utf8 if $copy !~ $latin_re;
431              
432 0 100       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433 0 100       0 goto set_utf8 if $copy =~ $every_char_is_latin_re;
434              
435 0         0 DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436              
437 10         13 set_1252:
438             DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439 10         24 $encoding = 'CP1252';
440 16         46 goto done_set;
441              
442 6         25 set_utf8:
443             DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444 6         40 $encoding = 'UTF-8';
445              
446 16         107 done_set:
447             $self->_handle_encoding_line( "=encoding $encoding" );
448 16         75 delete $self->{'_processed_encoding'};
449 16 50       72 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450              
451 16         198 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;
452              
453             $self->whine(
454 16         109 $self->{'line_count'},
455             "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456             );
457             }
458              
459 15322         18813 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460              
461 15322 100       23130 if(!$self->{'in_pod'}) {
462 1274 100       3355 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
463 735 100       1988 if($1 eq 'cut') {
464             $self->scream(
465 9         88 $self->{'line_count'},
466             "=cut found outside a pod block. Skipping to next block."
467             );
468              
469             ## Before there were errata sections in the world, it was
470             ## least-pessimal to abort processing the file. But now we can
471             ## just barrel on thru (but still not start a pod block).
472             #splice @_;
473             #push @_, undef;
474              
475 9         25 next;
476             } else {
477             $self->{'in_pod'} = $self->{'start_of_pod_block'}
478 726         1876 = $self->{'last_was_blank'} = 1;
479             # And fall thru to the pod-mode block further down
480             }
481             } else {
482 539         710 DEBUG > 5 and print STDERR "# It's a code-line.\n";
483 539 100       910 $code_handler->(map $_, $line, $self->{'line_count'}, $self)
484             if $code_handler;
485             # Note: this may cause code to be processed out of order relative
486             # to pods, but in order relative to cuts.
487              
488             # Note also that we haven't yet applied the transcoding to $line
489             # by time we call $code_handler!
490              
491 544 50       1020 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
492             # That RE is from perlsyn, section "Plain Old Comments (Not!)",
493             #$fname = $2 if defined $2;
494             #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495 5         31 DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496 5         46 $self->{'line_count'} = $1 - 1;
497             }
498              
499 544         996 next;
500             }
501             }
502              
503             # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504             # Else we're in pod mode:
505              
506             # Apply any necessary transcoding:
507 14774 100       22917 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508              
509             # HERE WE CATCH =encoding EARLY!
510 14780 100       22844 if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511 14 100       78 next if $self->parse_characters; # Ignore this line
512 14         74 $line = $self->_handle_encoding_line( $line );
513             }
514              
515 14777 100       47339 if($line =~ m/^=cut/s) {
    100          
    100          
516             # here ends the pod block, and therefore the previous pod para
517 123         196 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518 130         267 $self->{'in_pod'} = 0;
519             # ++$self->{'pod_para_count'};
520 130         351 $self->_ponder_paragraph_buffer();
521             # by now it's safe to consider the previous paragraph as done.
522 130         211 DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523 130 100       505 $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524             if $cut_handler;
525              
526             # TODO: add to docs: Note: this may cause cuts to be processed out
527             # of order relative to pods, but in order relative to code.
528              
529             } elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530 4670 100 66     15791 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531 2817 100       3691 $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532             if $wl_handler;
533             }
534              
535 7473 100 66     24072 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      100        
536 951         1651 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537 732         1325 push @{$paras->[-1]}, $line;
  523         1139  
538             } # otherwise it's not interesting
539              
540 4659 100 100     13265 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541 4827         5655 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542             }
543              
544 4878         6926 $self->{'last_was_blank'} = 1;
545              
546             } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547              
548 5410 100       14218 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(\s+|$)(.*)/s) {
    100          
549             # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550 2554         9156 my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551 2335 100 100     8224 $new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552             # Note that in "=head1 foo", the WS is lost.
553             # Example: ['=head1', {'start_line' => 123}, ' foo']
554              
555 2335         3349 ++$self->{'pod_para_count'};
556              
557 2554         6082 $self->_ponder_paragraph_buffer();
558             # by now it's safe to consider the previous paragraph as done.
559              
560 4930         9869 push @$paras, $new; # the new incipient paragraph
561 4930         7717 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562              
563             } elsif($line =~ m/^\s/s) {
564              
565 653 100 33     3086 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      66        
566 169         263 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567 2732         9506 push @{$paras->[-1]}, $line;
  184         356  
568             } else {
569 529         754 ++$self->{'pod_para_count'};
570 529         1305 $self->_ponder_paragraph_buffer();
571             # by now it's safe to consider the previous paragraph as done.
572 529         711 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573 529         1939 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574             }
575             } else {
576 3304         6941 ++$self->{'pod_para_count'};
577 2256         5171 $self->_ponder_paragraph_buffer();
578             # by now it's safe to consider the previous paragraph as done.
579 3304         11643 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580 2343         3303 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581             }
582 5299         9694 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583              
584             } else {
585             # It's a non-blank line /continuing/ the current para
586 4912 50       7351 if(@$paras) {
587 5873         8066 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588 5819         6208 push @{$paras->[-1]}, $line;
  5873         10932  
589             } else {
590             # Unexpected case!
591 1146         3262 die "Continuing a paragraph but \@\$paras is empty?";
592             }
593 5322         9957 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594             }
595              
596             } # ends the big while loop
597              
598 8153         11266 DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599 8153         37390 return $self;
600             }
601              
602             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603              
604             sub _handle_encoding_line {
605 535     18   1422 my($self, $line) = @_;
606              
607 535 50       996 return if $self->parse_characters;
608              
609             # The point of this routine is to set $self->{'_transcoder'} as indicated.
610              
611 535 50       827 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
612 111         534 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
613              
614 52         90 my $e = $1;
615 52         87 my $orig = $e;
616 52         105 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
  76         161  
617              
618 76         165 my $enc_error;
619              
620             # Cf. perldoc Encode and perldoc Encode::Supported
621              
622 76         3593 require Pod::Simple::Transcode;
623              
624 76 100 33     301 if( $self->{'encoding'} ) {
    50          
    100          
625 534         725 my $norm_current = $self->{'encoding'};
626 534         1276 my $norm_e = $e;
627 534         1895 foreach my $that ($norm_current, $norm_e) {
628 534         778 $that = lc($that);
629 1146         2161 $that =~ s/[-_]//g;
630             }
631 333 100       601 if($norm_current eq $norm_e) {
632 333         389 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
633             "redundant. ($norm_current eq $norm_e). Ignoring.\n";
634 333         437 $enc_error = '';
635             # But that doesn't necessarily mean that the earlier one went okay
636             } else {
637 333         721 $enc_error = "Encoding is already set to " . $self->{'encoding'};
638 0         0 DEBUG > 1 and print STDERR $enc_error;
639             }
640             } elsif (
641             # OK, let's turn on the encoding
642             do {
643 350         756 DEBUG > 1 and print STDERR " Setting encoding to $e\n";
644 1853         2088 $self->{'encoding'} = $e;
645 1853         8644 1;
646             }
647             and $e eq 'HACKRAW'
648             ) {
649 42         109 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
650              
651             } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
652              
653             die($enc_error = "WHAT? _transcoder is already set?!")
654 59 50       7781 if $self->{'_transcoder'}; # should never happen
655 59         292 require Pod::Simple::Transcode;
656 59         159 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
657 59         139 eval {
658 59         129 my @x = ('', "abc", "123");
659 59         126 $self->{'_transcoder'}->(@x);
660             };
661 59 50       222 $@ && die( $enc_error =
662             "Really unexpected error setting up encoding $e: $@\nAborting"
663             );
664 59         124 $self->{'detected_encoding'} = $e;
665              
666             } else {
667 42         2770 my @supported = Pod::Simple::Transcode::->all_encodings;
668              
669             # Note unsupported, and complain
670 42         134 DEBUG and print STDERR " Encoding [$e] is unsupported.",
671             "\nSupporteds: @supported\n";
672 5         13 my $suggestion = '';
673              
674             # Look for a near match:
675 5         9 my $norm = lc($e);
676 5         12 $norm =~ tr[-_][]d;
677 10         23 my $n;
678 10         46 foreach my $enc (@supported) {
679 5         16 $n = lc($enc);
680 3         7 $n =~ tr[-_][]d;
681 3 50       5 next unless $n eq $norm;
682 2         7 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
683 2         3 last;
684             }
685 37         85 my $encmodver = Pod::Simple::Transcode::->encmodver;
686 37         122 $enc_error = join '' =>
687             "This document probably does not appear as it should, because its ",
688             "\"=encoding $e\" line calls for an unsupported encoding.",
689             $suggestion, " [$encmodver\'s supported encodings are: @supported]"
690             ;
691              
692 37         390 $self->scream( $self->{'line_count'}, $enc_error );
693             }
694 17         30 push @{ $self->{'encoding_command_statuses'} }, $enc_error;
  52         21219  
695 52 100       209 if (defined($self->{'_processed_encoding'})) {
696             # Double declaration.
697 35         161 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
698             }
699 52         125 $self->{'_processed_encoding'} = $orig;
700              
701 52         138 return $line;
702             }
703              
704             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
705              
706             sub _handle_encoding_second_level {
707             # By time this is called, the encoding (if well formed) will already
708             # have been acted on.
709 42     8   118 my($self, $para) = @_;
710 42         123 my @x = @$para;
711 42         119 my $content = join ' ', splice @x, 2;
712 9         1719 $content =~ s/^\s+//s;
713 9         1897 $content =~ s/\s+$//s;
714              
715 9         17 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
716              
717 9 100       39 if (defined($self->{'_processed_encoding'})) {
718             #if($content ne $self->{'_processed_encoding'}) {
719             # Could it happen?
720             #}
721 9         20 delete $self->{'_processed_encoding'};
722             # It's already been handled. Check for errors.
723 9 50       46 if(! $self->{'encoding_command_statuses'} ) {
    100          
724 2         6 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
725             } elsif( $self->{'encoding_command_statuses'}[-1] ) {
726             $self->whine( $para->[1]{'start_line'},
727             sprintf "Couldn't do %s: %s",
728             $self->{'encoding_command_reqs' }[-1],
729 248         299 $self->{'encoding_command_statuses'}[-1],
730             );
731             } else {
732 255         290 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
733             }
734              
735             } else {
736             # Otherwise it's a syntax error
737 248         383 $self->whine( $para->[1]{'start_line'},
738             "Invalid =encoding syntax: $content"
739             );
740             }
741              
742 7         15 return;
743             }
744              
745             #~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
746              
747             {
748             my $m = -321; # magic line number
749              
750             sub _gen_errata {
751 667     668   919 my $self = $_[0];
752             # Return 0 or more fake-o paragraphs explaining the accumulated
753             # errors on this document.
754              
755 669 100 66     2073 return() unless $self->{'errata'} and keys %{$self->{'errata'}};
  16         96  
756              
757 16         48 my @out;
758              
759 56         98 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
  69         172  
  56         174  
760             push @out,
761             ['=item', {'start_line' => $m}, "Around line $line:"],
762             map( ['~Para', {'start_line' => $m, '~cooked' => 1},
763             #['~Top', {'start_line' => $m},
764             $_
765             #]
766             ],
767 33         117 @{$self->{'errata'}{$line}}
  74         246  
768             )
769             ;
770             }
771              
772             # TODO: report of unknown entities? unrenderable characters?
773              
774 56         265 unshift @out,
775             ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
776             ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
777             "Hey! ",
778             ['B', {},
779             'The above document had some coding errors, which are explained below:'
780             ]
781             ],
782             ['=over', {'start_line' => $m, 'errata' => 1}, ''],
783             ;
784              
785 45         123 push @out,
786             ['=back', {'start_line' => $m, 'errata' => 1}, ''],
787             ;
788              
789 45         109 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
790              
791 45         141 return @out;
792             }
793              
794             }
795              
796             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797              
798             ##############################################################################
799             ##
800             ## stop reading now stop reading now stop reading now stop reading now stop
801             ##
802             ## HERE IT BECOMES REALLY SCARY
803             ##
804             ## stop reading now stop reading now stop reading now stop reading now stop
805             ##
806             ##############################################################################
807              
808             sub _ponder_paragraph_buffer {
809              
810             # Para-token types as found in the buffer.
811             # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
812             # =over, =back, =item
813             # and the null =pod (to be complained about if over one line)
814             #
815             # "~data" paragraphs are something we generate at this level, depending on
816             # a currently open =over region
817              
818             # Events fired: Begin and end for:
819             # directivename (like head1 .. head4), item, extend,
820             # for (from =begin...=end, =for),
821             # over-bullet, over-number, over-text, over-block,
822             # item-bullet, item-number, item-text,
823             # Document,
824             # Data, Para, Verbatim
825             # B, C, longdirname (TODO -- wha?), etc. for all directives
826             #
827              
828 5870     5840   7352 my $self = $_[0];
829 5870         6527 my $paras;
830 5870 100       6583 return unless @{$paras = $self->{'paras'}};
  5870         12570  
831 5146   100     11648 my $curr_open = ($self->{'curr_open'} ||= []);
832              
833 5146         5972 my $scratch;
834              
835 5116         5654 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
836              
837             # We have something in our buffer. So apparently the document has started.
838 5120 100       8793 unless($self->{'doc_has_started'}) {
839 707         1730 $self->{'doc_has_started'} = 1;
840              
841 682         1211 my $starting_contentless;
842 712   66     4196 $starting_contentless =
843             (
844             !@$curr_open
845             and @$paras and ! grep $_->[0] ne '~end', @$paras
846             # i.e., if the paras is all ~ends
847             )
848             ;
849 902         1240 DEBUG and print STDERR "# Starting ",
850             $starting_contentless ? 'contentless' : 'contentful',
851             " document\n"
852             ;
853              
854             $self->_handle_element_start(
855             ($scratch = 'Document'),
856             {
857 902 100       4034 'start_line' => $paras->[0][1]{'start_line'},
858             $starting_contentless ? ( 'contentless' => 1 ) : (),
859             },
860             );
861             }
862              
863 5142         7628 my($para, $para_type);
864 5142         8778 while(@$paras) {
865              
866             # If a directive, assume it's legal; subtract below if found not to be
867 7341 100       18324 $seen_legal_directive++ if $paras->[0][0] =~ /^=/;
868              
869             last if @$paras == 1
870             and ( $paras->[0][0] eq '=over'
871             or $paras->[0][0] eq '=item'
872 7316 100 100     27907 or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
      100        
873             # Those're the three kinds of paragraphs that require lookahead.
874             # Actually, an "=item Foo" inside an region
875             # and any =item inside an region (rare)
876             # don't require any lookahead, but all others (bullets
877             # and numbers) do.
878             # The verbatim is different from the other two, because those might be
879             # like:
880             #
881             # =item
882             # ...
883             # =cut
884             # ...
885             # =item
886             #
887             # The =cut here finishes the paragraph but doesn't terminate the =over
888             # they should be in. (khw apologizes that he didn't comment at the time
889             # why the 'in_pod' works, and no longer remembers why, and doesn't think
890             # it is currently worth the effort to re-figure it out.)
891              
892             # TODO: whinge about many kinds of directives in non-resolving =for regions?
893             # TODO: many? like what? =head1 etc?
894              
895 5943         10695 $para = shift @$paras;
896 5944         9227 $para_type = $para->[0];
897              
898 5944         6907 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
899             $self->_dump_curr_open(), ")\n";
900              
901 5943 100       16938 if($para_type eq '=for') {
    100          
    100          
    100          
902 74 50       214 next if $self->_ponder_for($para,$curr_open,$paras);
903              
904             } elsif($para_type eq '=begin') {
905 83 50       189 next if $self->_ponder_begin($para,$curr_open,$paras);
906              
907             } elsif($para_type eq '=end') {
908 83 50       216 next if $self->_ponder_end($para,$curr_open,$paras);
909              
910             } elsif($para_type eq '~end') { # The virtual end-document signal
911 2065 50       3747 next if $self->_ponder_doc_end($para,$curr_open,$paras);
912             }
913              
914              
915             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
916             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
917 6446 100       13173 if(grep $_->[1]{'~ignore'}, @$curr_open) {
918 1398         1624 DEBUG > 1 and
919             print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
920 1398         2932 next;
921             }
922             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
923             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
924              
925 6217 100       14132 if($para_type eq '=pod') {
    100          
    100          
926 1648         2764 $self->_ponder_pod($para,$curr_open,$paras);
927              
928             } elsif($para_type eq '=over') {
929 1325 100       2102 next if $self->_ponder_over($para,$curr_open,$paras);
930              
931             } elsif($para_type eq '=back') {
932 1325 100       2608 next if $self->_ponder_back($para,$curr_open,$paras);
933              
934             } else {
935              
936             # All non-magical codes!!!
937              
938             # Here we start using $para_type for our own twisted purposes, to
939             # mean how it should get treated, not as what the element name
940             # should be.
941              
942 4478         5115 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
943              
944 4478         4943 my $i;
945              
946             # Enforce some =headN discipline
947 4478 100 66     11780 if($para_type =~ m/^=head\d$/s
      100        
      66        
948             and ! $self->{'accept_heads_anywhere'}
949             and @$curr_open
950             and $curr_open->[-1][0] eq '=over'
951             ) {
952 221         291 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
953             $self->whine(
954 221         1015 $para->[1]{'start_line'},
955             "You forgot a '=back' before '$para_type'"
956             );
957 1169         1933 unshift @$paras, ['=back', {}, ''], $para; # close the =over
958 1169         2024 next;
959             }
960              
961              
962 5917 100 66     18360 if($para_type eq '=item') {
    100          
    100          
    100          
    100          
    100          
    100          
963              
964 2628         7872 my $over;
965 2511 50 33     4773 unless(@$curr_open and
966 2619         6133 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
967             $self->whine(
968 1543         2250 $para->[1]{'start_line'},
969             "'=item' outside of any '=over'"
970             );
971             unshift @$paras,
972 1543         4385 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
973             $para
974             ;
975 2         52 next;
976             }
977              
978              
979 1038         1705 my $over_type = $over->[1]{'~type'};
980              
981 1041 50       2602 if(!$over_type) {
    50          
    100          
    100          
    50          
982             # Shouldn't happen1
983             die "Typeless over in stack, starting at line "
984 251         715 . $over->[1]{'start_line'};
985              
986             } elsif($over_type eq 'block') {
987 1147 0       2642 unless($curr_open->[-1][1]{'~bitched_about'}) {
988 22         25 $curr_open->[-1][1]{'~bitched_about'} = 1;
989             $self->whine(
990             $curr_open->[-1][1]{'start_line'},
991             "You can't have =items (as at line "
992 22         39 . $para->[1]{'start_line'}
993             . ") unless the first thing after the =over is an =item"
994             );
995             }
996             # Just turn it into a paragraph and reconsider it
997 1125         2592 $para->[0] = '~Para';
998 98         257 unshift @$paras, $para;
999 57         262 next;
1000              
1001             } elsif($over_type eq 'text') {
1002 834         2045 my $item_type = $self->_get_item_type($para);
1003             # That kills the content of the item if it's a number or bullet.
1004 1698         2072 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1005              
1006 1698 100 66     2637 if($item_type eq 'text') {
    50          
1007             # Nothing special needs doing for 'text'
1008             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1009             $self->whine(
1010 921         2718 $para->[1]{'start_line'},
1011             "Expected text after =item, not a $item_type"
1012             );
1013             # Undo our clobbering:
1014 10         18 push @$para, $para->[1]{'~orig_content'};
1015 10         45 delete $para->[1]{'number'};
1016             # Only a PROPER item-number element is allowed
1017             # to have a number attribute.
1018             } else {
1019 6         19 die "Unhandled item type $item_type"; # should never happen
1020             }
1021              
1022             # =item-text thingies don't need any assimilation, it seems.
1023              
1024             } elsif($over_type eq 'number') {
1025 34         92 my $item_type = $self->_get_item_type($para);
1026             # That kills the content of the item if it's a number or bullet.
1027 939         3813 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1028              
1029 82         154 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1030              
1031 82 50       331 if($item_type eq 'bullet') {
    50          
    50          
    50          
1032             # Hm, it's not numeric. Correct for this.
1033 54         269 $para->[1]{'number'} = $expected_value;
1034             $self->whine(
1035 0         0 $para->[1]{'start_line'},
1036             "Expected '=item $expected_value'"
1037             );
1038 0         0 push @$para, $para->[1]{'~orig_content'};
1039             # restore the bullet, blocking the assimilation of next para
1040              
1041             } elsif($item_type eq 'text') {
1042             # Hm, it's not numeric. Correct for this.
1043 0         0 $para->[1]{'number'} = $expected_value;
1044             $self->whine(
1045 54         120 $para->[1]{'start_line'},
1046             "Expected '=item $expected_value'"
1047             );
1048             # Text content will still be there and will block next ~Para
1049              
1050             } elsif($item_type ne 'number') {
1051 54         228 die "Unknown item type $item_type"; # should never happen
1052              
1053             } elsif($expected_value == $para->[1]{'number'}) {
1054 28         44 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1055              
1056             } else {
1057 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1058             " instead of the expected value of $expected_value\n";
1059             $self->whine(
1060             $para->[1]{'start_line'},
1061 0         0 "You have '=item " . $para->[1]{'number'} .
1062             "' instead of the expected '=item $expected_value'"
1063             );
1064 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1065             }
1066              
1067 28 50       77 if(@$para == 2) {
1068             # For the cases where we /didn't/ push to @$para
1069 28 100       63 if($paras->[0][0] eq '~Para') {
1070 25         29 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1071 79         220 push @$para, splice @{shift @$paras},2;
  79         149  
1072             } else {
1073 57         178 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1074 3         6 push @$para, ''; # Just so it's not contentless
1075             }
1076             }
1077              
1078              
1079             } elsif($over_type eq 'bullet') {
1080 159         436 my $item_type = $self->_get_item_type($para);
1081             # That kills the content of the item if it's a number or bullet.
1082 159         232 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1083              
1084 159 50       326 if($item_type eq 'bullet') {
    0          
    0          
1085             # as expected!
1086              
1087 159 100       323 if( $para->[1]{'~_freaky_para_hack'} ) {
1088 101         104 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1089 101         240 push @$para, $para->[1]{'~_freaky_para_hack'};
1090             }
1091              
1092             } elsif($item_type eq 'number') {
1093             $self->whine(
1094 0         0 $para->[1]{'start_line'},
1095             "Expected '=item *'"
1096             );
1097 0         0 push @$para, $para->[1]{'~orig_content'};
1098             # and block assimilation of the next paragraph
1099 0         0 delete $para->[1]{'number'};
1100             # Only a PROPER item-number element is allowed
1101             # to have a number attribute.
1102             } elsif($item_type eq 'text') {
1103             $self->whine(
1104 0         0 $para->[1]{'start_line'},
1105             "Expected '=item *'"
1106             );
1107             # But doesn't need processing. But it'll block assimilation
1108             # of the next para.
1109             } else {
1110 0         0 die "Unhandled item type $item_type"; # should never happen
1111             }
1112              
1113 159 100       340 if(@$para == 2) {
1114             # For the cases where we /didn't/ push to @$para
1115 58 50       137 if($paras->[0][0] eq '~Para') {
1116 58         85 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1117 58         81 push @$para, splice @{shift @$paras},2;
  58         154  
1118             } else {
1119 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1120 0         0 push @$para, ''; # Just so it's not contentless
1121             }
1122             }
1123              
1124             } else {
1125 0         0 die "Unhandled =over type \"$over_type\"?";
1126             # Shouldn't happen!
1127             }
1128              
1129 968         1317 $para_type = 'Plain';
1130 968         2089 $para->[0] .= '-' . $over_type;
1131             # Whew. Now fall thru and process it.
1132              
1133              
1134             } elsif($para_type eq '=extend') {
1135             # Well, might as well implement it here.
1136 0         0 $self->_ponder_extend($para);
1137 0         0 next; # and skip
1138             } elsif($para_type eq '=encoding') {
1139             # Not actually acted on here, but we catch errors here.
1140 7         55 $self->_handle_encoding_second_level($para);
1141 7 100       45 next unless $self->keep_encoding_directive;
1142 4         12 $para_type = 'Plain';
1143             } elsif($para_type eq '~Verbatim') {
1144 483         787 $para->[0] = 'Verbatim';
1145 483         754 $para_type = '?Verbatim';
1146             } elsif($para_type eq '~Para') {
1147 2198         3086 $para->[0] = 'Para';
1148 2198         2922 $para_type = '?Plain';
1149             } elsif($para_type eq 'Data') {
1150 28         44 $para->[0] = 'Data';
1151 28         39 $para_type = '?Data';
1152             } elsif( $para_type =~ s/^=//s
1153             and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1154             ) {
1155 565         879 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1156             } else {
1157             # An unknown directive!
1158 8         15 $seen_legal_directive--;
1159             DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1160 8         10 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1161             ;
1162             $self->whine(
1163 8         55 $para->[1]{'start_line'},
1164             "Unknown directive: $para->[0]"
1165             );
1166              
1167             # And maybe treat it as text instead of just letting it go?
1168 8         21 next;
1169             }
1170              
1171 4246 100       12755 if($para_type =~ s/^\?//s) {
1172 2709 100       4878 if(! @$curr_open) { # usual case
1173 1610         1959 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1174             } else {
1175 1099         2388 my @fors = grep $_->[0] eq '=for', @$curr_open;
1176             DEBUG > 1 and print STDERR "Containing fors: ",
1177 1099         1261 join(',', map $_->[1]{'target'}, @fors), "\n";
1178              
1179 1099 100       1817 if(! @fors) {
    100          
1180 1111         1413 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1181              
1182             #} elsif(grep $_->[1]{'~resolve'}, @fors) {
1183             #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1184             } elsif( $fors[-1][1]{'~resolve'} ) {
1185             # Look to the immediately containing for
1186              
1187 74 100       206 if($para_type eq 'Data') {
1188 38         102 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1189 38         75 $para->[0] = 'Para';
1190 48         165 $para_type = 'Plain';
1191             } else {
1192 34         108 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1193             }
1194             } else {
1195 53         103 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1196 75         148 $para->[0] = $para_type = 'Data';
1197             }
1198             }
1199             }
1200              
1201             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1202 4299 100       7409 if($para_type eq 'Plain') {
    100          
    50          
1203 4317         8547 $self->_ponder_Plain($para);
1204             } elsif($para_type eq 'Verbatim') {
1205 1055         2255 $self->_ponder_Verbatim($para);
1206             } elsif($para_type eq 'Data') {
1207 24         103 $self->_ponder_Data($para);
1208             } else {
1209 2         4 die "\$para type is $para_type -- how did that happen?";
1210             # Shouldn't happen.
1211             }
1212              
1213             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1214 4418         10805 $para->[0] =~ s/^[~=]//s;
1215              
1216 4250         5004 DEBUG and print STDERR "\n", pretty($para), "\n";
1217              
1218             # traverse the treelet (which might well be just one string scalar)
1219             $self->{'content_seen'} ||= 1 if $seen_legal_directive
1220 4250 100 100     16130 && ! $self->{'~tried_gen_errata'};
      100        
1221 4250         9799 $self->_traverse_treelet_bit(@$para);
1222             }
1223             }
1224              
1225 5120         12628 return;
1226             }
1227              
1228             ###########################################################################
1229             # The sub-ponderers...
1230              
1231              
1232              
1233             sub _ponder_for {
1234 934     49   3116 my ($self,$para,$curr_open,$paras) = @_;
1235              
1236             # Fake it out as a begin/end
1237 677         1299 my $target;
1238              
1239 502 50       701 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1240 175         467 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1241 175         221 return 1;
1242             }
1243              
1244 223         476 for(my $i = 2; $i < @$para; ++$i) {
1245 141 50       347 if($para->[$i] =~ s/^\s*(\S+)\s*//s) {
1246 101         208 $target = $1;
1247 49         72 last;
1248             }
1249             }
1250 49 50       90 unless(defined $target) {
1251             $self->whine(
1252 1         2 $para->[1]{'start_line'},
1253             "=for without a target?"
1254             );
1255 52         70 return 1;
1256             }
1257 77         88 DEBUG > 1 and
1258             print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1259              
1260 77         133 $para->[0] = 'Data';
1261              
1262             unshift @$paras,
1263             ['=begin',
1264             {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1265             $target,
1266             ],
1267             $para,
1268             ['=end',
1269 934         2075 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1270             $target,
1271             ],
1272             ;
1273              
1274 851         2043 return 1;
1275             }
1276              
1277             sub _ponder_begin {
1278 108     223   259 my ($self,$para,$curr_open,$paras) = @_;
1279 89         247 my $content = join ' ', splice @$para, 2;
1280 57         148 $content =~ s/^\s+//s;
1281 943         2458 $content =~ s/\s+$//s;
1282 943 50       1335 unless(length($content)) {
1283             $self->whine(
1284 886         3724 $para->[1]{'start_line'},
1285             "=begin without a target?"
1286             );
1287 886         1961 DEBUG and print STDERR "Ignoring targetless =begin\n";
1288 1169         2825 return 1;
1289             }
1290              
1291 59         226 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/;
1292 59 100       203 $para->[1]{'title'} = $title if ($title);
1293 59         155 $para->[1]{'target'} = $target; # without any ':'
1294 57         79 $content = $target; # strip off the title
1295              
1296 57         91 $content =~ s/^:!/!:/s;
1297 59         85 my $neg; # whether this is a negation-match
1298 59 100       146 $neg = 1 if $content =~ s/^!//s;
1299 59         81 my $to_resolve; # whether to process formatting codes
1300 59 100       131 $to_resolve = 1 if $content =~ s/^://s;
1301              
1302 59         77 my $dont_ignore; # whether this target matches us
1303              
1304 57 100       201 foreach my $target_name (
1305             split(',', $content, -1),
1306             $neg ? () : '*'
1307             ) {
1308 116         127 DEBUG > 2 and
1309             print STDERR " Considering whether =begin $content matches $target_name\n";
1310 118 100       265 next unless $self->{'accept_targets'}{$target_name};
1311              
1312 32         41 DEBUG > 2 and
1313             print STDERR " It DOES match the acceptable target $target_name!\n";
1314             $to_resolve = 1
1315 32 100       109 if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1316 32         50 $dont_ignore = 1;
1317 100         174 $para->[1]{'target_matching'} = $target_name;
1318 100         202 last; # stop looking at other target names
1319             }
1320              
1321 127 100       257 if($neg) {
1322 89 100       194 if( $dont_ignore ) {
1323 76         129 $dont_ignore = '';
1324 6         11 delete $para->[1]{'target_matching'};
1325 6         7 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1326             } else {
1327 13         15 $dont_ignore = 1;
1328 83         282 $para->[1]{'target_matching'} = '!';
1329 83         160 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1330             }
1331             }
1332              
1333 127         234 $para->[0] = '=for'; # Just what we happen to call these, internally
1334 127   100     245 $para->[1]{'~really'} ||= '=begin';
1335 127   100     269 $para->[1]{'~ignore'} = (! $dont_ignore) || 0;
1336 127   100     229 $para->[1]{'~resolve'} = $to_resolve || 0;
1337              
1338 127         277 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1339             "ignore contents of this region\n";
1340 127         154 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1341             ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1342 127         236 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1343              
1344 127         201 push @$curr_open, $para;
1345 127 100 100     421 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1346 153         165 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1347             } else {
1348 170 50 100     377 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1349 73         172 $self->_handle_element_start((my $scratch='for'), $para->[1]);
1350             }
1351              
1352 93         255 return 1;
1353             }
1354              
1355             sub _ponder_end {
1356 93     57   151 my ($self,$para,$curr_open,$paras) = @_;
1357 93         200 my $content = join ' ', splice @$para, 2;
1358 93         179 $content =~ s/^\s+//s;
1359 127         232 $content =~ s/\s+$//s;
1360 71         91 DEBUG and print STDERR "Ogling '=end $content' directive\n";
1361              
1362 57 50       117 unless(length($content)) {
1363             $self->whine(
1364             $para->[1]{'start_line'},
1365             "'=end' without a target?" . (
1366             ( @$curr_open and $curr_open->[-1][0] eq '=for' )
1367 0 0 0     0 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1368             : ''
1369             )
1370             );
1371 0         0 DEBUG and print STDERR "Ignoring targetless =end\n";
1372 14         20 return 1;
1373             }
1374              
1375 71 50       191 unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1376             $self->whine(
1377 14         26 $para->[1]{'start_line'},
1378             "'=end $content' is invalid. (Stack: "
1379             . $self->_dump_curr_open() . ')'
1380             );
1381 70         101 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1382 70         270 return 1;
1383             }
1384              
1385 127 50 33     415 unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1386             $self->whine(
1387 70         169 $para->[1]{'start_line'},
1388             "=end $content without matching =begin. (Stack: "
1389             . $self->_dump_curr_open() . ')'
1390             );
1391 70         77 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1392 70         76 return 1;
1393             }
1394              
1395 127 100       198 unless($content eq $curr_open->[-1][1]{'target'}) {
1396             $self->whine(
1397             $para->[1]{'start_line'},
1398             "=end $content doesn't match =begin "
1399 70         120 . $curr_open->[-1][1]{'target'}
1400             . ". (Stack: "
1401             . $self->_dump_curr_open() . ')'
1402             );
1403 70         237 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1404 20         22 return 1;
1405             }
1406              
1407             # Else it's okay to close...
1408 107 100       294 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1409 70         180 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1410             # And that may be because of this to-be-closed =for region, or some
1411             # other one, but it doesn't matter.
1412             } else {
1413 107         281 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1414             # what's that for?
1415              
1416 110 50 50     246 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1417 110         299 $self->_handle_element_end( my $scratch = 'for', $para->[1]);
1418             }
1419 130         253 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1420 130         232 pop @$curr_open;
1421              
1422 130         304 return 1;
1423             }
1424              
1425             sub _ponder_doc_end {
1426 760     2523   1581 my ($self,$para,$curr_open,$paras) = @_;
1427 687 100       1273 if(@$curr_open) { # Deal with things left open
1428 6         10 DEBUG and print STDERR "Stack is nonempty at end-document: (",
1429             $self->_dump_curr_open(), ")\n";
1430              
1431 6         8 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1432 79         233 unshift @$paras, $self->_closers_for_all_curr_open;
1433             # Make sure there is exactly one ~end in the parastack, at the end:
1434 6         26 @$paras = grep $_->[0] ne '~end', @$paras;
1435 6         14 push @$paras, $para, $para;
1436             # We need two -- once for the next cycle where we
1437             # generate errata, and then another to be at the end
1438             # when that loop back around to process the errata.
1439 6         33 return 1;
1440              
1441             } else {
1442 754         1184 DEBUG and print STDERR "Okay, stack is empty now.\n";
1443             }
1444              
1445             # Try generating errata section, if applicable
1446 681 100       1490 unless($self->{'~tried_gen_errata'}) {
1447 667         1343 $self->{'~tried_gen_errata'} = 1;
1448 667         1645 my @extras = $self->_gen_errata();
1449 740 100       1751 if(@extras) {
1450 17         54 unshift @$paras, @extras;
1451 17         31 DEBUG and print STDERR "Generated errata... relooping...\n";
1452 17         68 return 1; # I.e., loop around again to process these fake-o paragraphs
1453             }
1454             }
1455              
1456 737         1412 splice @$paras; # Well, that's that for this paragraph buffer.
1457 687         931 DEBUG and print STDERR "Throwing end-document event.\n";
1458              
1459 717         1962 $self->_handle_element_end( my $scratch = 'Document' );
1460 717         2341 return 1; # Hasta la byebye
1461             }
1462              
1463             sub _ponder_pod {
1464 529     521   1126 my ($self,$para,$curr_open,$paras) = @_;
1465             $self->whine(
1466 549 50       1072 $para->[1]{'start_line'},
1467             "=pod directives shouldn't be over one line long! Ignoring all "
1468             . (@$para - 2) . " lines of content"
1469             ) if @$para > 3;
1470              
1471             # Content ignored unless 'pod_handler' is set
1472 549 100       1114 if (my $pod_handler = $self->{'pod_handler'}) {
1473 70         275 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1474 251 100       493 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1475 251         460 $pod_handler->($line, $line_num, $self);
1476             }
1477              
1478             # The surrounding methods set content_seen, so let us remain consistent.
1479             # I do not know why it was not here before -- should it not be here?
1480             # $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1481              
1482 483         1177 return;
1483             }
1484              
1485             sub _ponder_over {
1486 160     187   410 my ($self,$para,$curr_open,$paras) = @_;
1487 160 50       365 return 1 unless @$paras;
1488 160         247 my $list_type;
1489              
1490 160 100       497 if($paras->[0][0] eq '=item') { # most common case
    100          
    50          
1491 147         585 $list_type = $self->_get_initial_item_type($paras->[0]);
1492              
1493             } elsif($paras->[0][0] eq '=back') {
1494             # Ignore empty lists by default
1495 247 100       320 if ($self->{'parse_empty_lists'}) {
1496 247         542 $list_type = 'empty';
1497             } else {
1498 221         378 shift @$paras;
1499 221         616 return 1;
1500             }
1501             } elsif($paras->[0][0] eq '~end') {
1502             $self->whine(
1503 221         490 $para->[1]{'start_line'},
1504             "=over is the last thing in the document?!"
1505             );
1506 26         110 return 1; # But feh, ignore it.
1507             } else {
1508 39         63 $list_type = 'block';
1509             }
1510 182         463 $para->[1]{'~type'} = $list_type;
1511 377         682 push @$curr_open, $para;
1512             # yes, we reuse the paragraph as a stack item
1513              
1514 377         926 my $content = join ' ', splice @$para, 2;
1515 377         959 $para->[1]{'~orig_content'} = $content;
1516 377         869 my $overness;
1517 254 100       978 if($content =~ m/^\s*$/s) {
    50          
1518 164         326 $para->[1]{'indent'} = 4;
1519             } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
1520 69     67   435512 no integer;
  67         171  
  67         557  
1521 188         474 $para->[1]{'indent'} = $1;
1522 96 50       400 if($1 == 0) {
1523             $self->whine(
1524 6         15 $para->[1]{'start_line'},
1525             "Can't have a 0 in =over $content"
1526             );
1527 6         72 $para->[1]{'indent'} = 4;
1528             }
1529             } else {
1530             $self->whine(
1531 98         219 $para->[1]{'start_line'},
1532             "=over should be: '=over' or '=over positive_number'"
1533             );
1534 57         141 $para->[1]{'indent'} = 4;
1535             }
1536 213         377 DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1537              
1538 213 100 100     723 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1539 213         814 $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1540              
1541 198         679 return;
1542             }
1543              
1544             sub _ponder_back {
1545 162     377   394 my ($self,$para,$curr_open,$paras) = @_;
1546             # TODO: fire off or or ??
1547              
1548 158         440 my $content = join ' ', splice @$para, 2;
1549 160 50       393 if($content =~ m/\S/) {
1550             $self->whine(
1551 4         19 $para->[1]{'start_line'},
1552             "=back doesn't take any parameters, but you said =back $content"
1553             );
1554             }
1555              
1556 156 50 33     777 if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1557 156         210 DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1558             # Expected case: we're closing the most recently opened thing
1559             #my $over = pop @$curr_open;
1560 165 100 50     514 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1561             $self->_handle_element_end( my $scratch =
1562 209         795 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1563             );
1564             } else {
1565 53         125 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1566             join(', ', map $_->[0], @$curr_open), ").\n";
1567             $self->whine(
1568 53         146 $para->[1]{'start_line'},
1569             '=back without =over'
1570             );
1571 53         120 return 1; # and ignore it
1572             }
1573             }
1574              
1575             sub _ponder_item {
1576 53     1378   95 my ($self,$para,$curr_open,$paras) = @_;
1577 53         201 my $over;
1578 53 0 0     127 unless(@$curr_open and
1579 0         0 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1580             $self->whine(
1581 0         0 $para->[1]{'start_line'},
1582             "'=item' outside of any '=over'"
1583             );
1584             unshift @$paras,
1585 0         0 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1586             $para
1587             ;
1588 0         0 return 1;
1589             }
1590              
1591              
1592 0         0 my $over_type = $over->[1]{'~type'};
1593              
1594 0 0       0 if(!$over_type) {
    0          
    0          
    0          
    0          
1595             # Shouldn't happen1
1596             die "Typeless over in stack, starting at line "
1597 53         70 . $over->[1]{'start_line'};
1598              
1599             } elsif($over_type eq 'block') {
1600 53 0       175 unless($curr_open->[-1][1]{'~bitched_about'}) {
1601 53         250 $curr_open->[-1][1]{'~bitched_about'} = 1;
1602             $self->whine(
1603             $curr_open->[-1][1]{'start_line'},
1604             "You can't have =items (as at line "
1605 53         197 . $para->[1]{'start_line'}
1606             . ") unless the first thing after the =over is an =item"
1607             );
1608             }
1609             # Just turn it into a paragraph and reconsider it
1610 53         149 $para->[0] = '~Para';
1611 53         157 unshift @$paras, $para;
1612 53         163 return 1;
1613              
1614             } elsif($over_type eq 'text') {
1615 0         0 my $item_type = $self->_get_item_type($para);
1616             # That kills the content of the item if it's a number or bullet.
1617 53         264 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1618              
1619 53 0 0     78 if($item_type eq 'text') {
    0          
1620             # Nothing special needs doing for 'text'
1621             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1622             $self->whine(
1623 53         152 $para->[1]{'start_line'},
1624             "Expected text after =item, not a $item_type"
1625             );
1626             # Undo our clobbering:
1627 53         227 push @$para, $para->[1]{'~orig_content'};
1628 0         0 delete $para->[1]{'number'};
1629             # Only a PROPER item-number element is allowed
1630             # to have a number attribute.
1631             } else {
1632 0         0 die "Unhandled item type $item_type"; # should never happen
1633             }
1634              
1635             # =item-text thingies don't need any assimilation, it seems.
1636              
1637             } elsif($over_type eq 'number') {
1638 0         0 my $item_type = $self->_get_item_type($para);
1639             # That kills the content of the item if it's a number or bullet.
1640 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1641              
1642 0         0 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1643              
1644 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
    0          
1645             # Hm, it's not numeric. Correct for this.
1646 0         0 $para->[1]{'number'} = $expected_value;
1647             $self->whine(
1648 0         0 $para->[1]{'start_line'},
1649             "Expected '=item $expected_value'"
1650             );
1651 0         0 push @$para, $para->[1]{'~orig_content'};
1652             # restore the bullet, blocking the assimilation of next para
1653              
1654             } elsif($item_type eq 'text') {
1655             # Hm, it's not numeric. Correct for this.
1656 0         0 $para->[1]{'number'} = $expected_value;
1657             $self->whine(
1658 0         0 $para->[1]{'start_line'},
1659             "Expected '=item $expected_value'"
1660             );
1661             # Text content will still be there and will block next ~Para
1662              
1663             } elsif($item_type ne 'number') {
1664 0         0 die "Unknown item type $item_type"; # should never happen
1665              
1666             } elsif($expected_value == $para->[1]{'number'}) {
1667 0         0 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1668              
1669             } else {
1670 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1671             " instead of the expected value of $expected_value\n";
1672             $self->whine(
1673             $para->[1]{'start_line'},
1674 0         0 "You have '=item " . $para->[1]{'number'} .
1675             "' instead of the expected '=item $expected_value'"
1676             );
1677 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1678             }
1679              
1680 0 0       0 if(@$para == 2) {
1681             # For the cases where we /didn't/ push to @$para
1682 0 0       0 if($paras->[0][0] eq '~Para') {
1683 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1684 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1685             } else {
1686 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1687 0         0 push @$para, ''; # Just so it's not contentless
1688             }
1689             }
1690              
1691              
1692             } elsif($over_type eq 'bullet') {
1693 0         0 my $item_type = $self->_get_item_type($para);
1694             # That kills the content of the item if it's a number or bullet.
1695 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1696              
1697 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
1698             # as expected!
1699              
1700 0 0       0 if( $para->[1]{'~_freaky_para_hack'} ) {
1701 0         0 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1702 0         0 push @$para, $para->[1]{'~_freaky_para_hack'};
1703             }
1704              
1705             } elsif($item_type eq 'number') {
1706             $self->whine(
1707 0         0 $para->[1]{'start_line'},
1708             "Expected '=item *'"
1709             );
1710 0         0 push @$para, $para->[1]{'~orig_content'};
1711             # and block assimilation of the next paragraph
1712 0         0 delete $para->[1]{'number'};
1713             # Only a PROPER item-number element is allowed
1714             # to have a number attribute.
1715             } elsif($item_type eq 'text') {
1716             $self->whine(
1717 0         0 $para->[1]{'start_line'},
1718             "Expected '=item *'"
1719             );
1720             # But doesn't need processing. But it'll block assimilation
1721             # of the next para.
1722             } else {
1723 0         0 die "Unhandled item type $item_type"; # should never happen
1724             }
1725              
1726 0 0       0 if(@$para == 2) {
1727             # For the cases where we /didn't/ push to @$para
1728 0 0       0 if($paras->[0][0] eq '~Para') {
1729 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1730 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1731             } else {
1732 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1733 0         0 push @$para, ''; # Just so it's not contentless
1734             }
1735             }
1736              
1737             } else {
1738 0         0 die "Unhandled =over type \"$over_type\"?";
1739             # Shouldn't happen!
1740             }
1741 0         0 $para->[0] .= '-' . $over_type;
1742              
1743 0         0 return;
1744             }
1745              
1746             sub _ponder_Plain {
1747 3743     3745   6068 my ($self,$para) = @_;
1748 3743         4297 DEBUG and print STDERR " giving plain treatment...\n";
1749 3743 100 100     18852 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
      66        
      100        
1750             or $para->[1]{'~cooked'}
1751             ) {
1752             push @$para,
1753 3694         4675 @{$self->_make_treelet(
1754             join("\n", splice(@$para, 2)),
1755 3694         15063 $para->[1]{'start_line'}
1756             )};
1757             }
1758             # Empty paragraphs don't need a treelet for any reason I can see.
1759             # And precooked paragraphs already have a treelet.
1760 3743         6660 return;
1761             }
1762              
1763             sub _ponder_Verbatim {
1764 481     551   899 my ($self,$para) = @_;
1765 481         591 DEBUG and print STDERR " giving verbatim treatment...\n";
1766              
1767 481         1024 $para->[1]{'xml:space'} = 'preserve';
1768              
1769 481 100       1054 unless ($self->{'_output_is_for_JustPod'}) {
1770             # Fix illegal settings for expand_verbatim_tabs()
1771             # This is because this module doesn't do input error checking, but khw
1772             # doesn't want to add yet another instance of that.
1773 400 100 100     1146 $self->expand_verbatim_tabs(8)
1774             if ! defined $self->expand_verbatim_tabs()
1775             || $self->expand_verbatim_tabs() =~ /\D/;
1776              
1777 400         1043 my $indent = $self->strip_verbatim_indent;
1778 400 100 100     931 if ($indent && ref $indent eq 'CODE') {
1779 10         11 my @shifted = (shift @{$para}, shift @{$para});
  10         14  
  10         16  
1780 10         22 $indent = $indent->($para);
1781 10         51 unshift @{$para}, @shifted;
  10         21  
1782             }
1783              
1784 400         954 for(my $i = 2; $i < @$para; $i++) {
1785 1756         2480 foreach my $line ($para->[$i]) { # just for aliasing
1786             # Strip indentation.
1787 1756 100       2620 $line =~ s/^\Q$indent// if $indent;
1788 1756 100       2758 next unless $self->expand_verbatim_tabs;
1789              
1790             # This is commented out because of github issue #85, and the
1791             # current maintainers don't know why it was there in the first
1792             # place.
1793             #&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1794 1755         4843 while( $line =~
1795             # Sort of adapted from Text::Tabs.
1796 987         1952 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1797             * $self->expand_verbatim_tabs)
1798             -(length($1)&7)))/e
1799             ) {}
1800              
1801             # TODO: whinge about (or otherwise treat) unindented or overlong lines
1802              
1803             }
1804             }
1805             }
1806              
1807             # Now the VerbatimFormatted hoodoo...
1808 1284 100 66     3445 if( $self->{'accept_codes'} and
    100          
1809             $self->{'accept_codes'}{'VerbatimFormatted'}
1810             ) {
1811 868   100     4772 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
  773         1201  
1812             # Kill any number of terminal newlines
1813 812         3541 $self->_verbatim_format($para);
1814             } elsif ($self->{'codes_in_verbatim'}) {
1815             push @$para,
1816 805         1564 @{$self->_make_treelet(
1817             join("\n", splice(@$para, 2)),
1818 53         113 $para->[1]{'start_line'}, $para->[1]{'xml:space'}
1819             )};
1820 53         73 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1821             } else {
1822 465 100       1880 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1823 465         2229 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1824             }
1825 532         1059 return;
1826             }
1827              
1828             sub _ponder_Data {
1829 73     95   196 my ($self,$para) = @_;
1830 73         144 DEBUG and print STDERR " giving data treatment...\n";
1831 22         45 $para->[1]{'xml:space'} = 'preserve';
1832 22 100       62 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1833 22         46 return;
1834             }
1835              
1836              
1837              
1838              
1839             ###########################################################################
1840              
1841             sub _traverse_treelet_bit { # for use only by the routine above
1842 7015     7266   12888 my($self, $name) = splice @_,0,2;
1843              
1844 7015         7764 my $scratch;
1845 7015         18801 $self->_handle_element_start(($scratch=$name), shift @_);
1846              
1847 7066         13280 while (@_) {
1848 12044         15352 my $x = shift;
1849 12044 100       16539 if (ref($x)) {
1850 3093         5518 &_traverse_treelet_bit($self, @$x);
1851             } else {
1852 9269   100     20859 $x .= shift while @_ && !ref($_[0]);
1853 8953         15729 $self->_handle_text($x);
1854             }
1855             }
1856              
1857 7066         16723 $self->_handle_element_end($scratch=$name);
1858 7052         17085 return;
1859             }
1860              
1861             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1862              
1863             sub _closers_for_all_curr_open {
1864 55     104   208 my $self = $_[0];
1865 43         119 my @closers;
1866 6 50       10 foreach my $still_open (@{ $self->{'curr_open'} || return }) {
  6         37  
1867 9         24 my @copy = @$still_open;
1868 23         75 $copy[1] = {%{ $copy[1] }};
  23         123  
1869             #$copy[1]{'start_line'} = -1;
1870 60 100       126 if($copy[0] eq '=for') {
    50          
1871 35         78 $copy[0] = '=end';
1872             } elsif($copy[0] eq '=over') {
1873             $self->whine(
1874             $still_open->[1]{start_line} ,
1875 38         73 "=over without closing =back"
1876             );
1877              
1878 38         67 $copy[0] = '=back';
1879             } else {
1880 32         74 die "I don't know how to auto-close an open $copy[0] region";
1881             }
1882              
1883 41 50       96 unless( @copy > 2 ) {
1884 1001         2135 push @copy, $copy[1]{'target'};
1885 1001 100       1287 $copy[-1] = '' unless defined $copy[-1];
1886             # since =over's don't have targets
1887             }
1888              
1889 1001         3195 $copy[1]{'fake-closer'} = 1;
1890              
1891 1001         1842 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1892 1126         1631 unshift @closers, \@copy;
1893             }
1894 1123         1924 return @closers;
1895             }
1896              
1897             #--------------------------------------------------------------------------
1898              
1899             sub _verbatim_format {
1900 171     122   380 my($it, $p) = @_;
1901              
1902 1076         2724 my $formatting;
1903              
1904 1076         2271 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1905 1340         3211 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1906 1340         3111 $p->[$i] .= "\n";
1907             # Unlike with simple Verbatim blocks, we don't end up just doing
1908             # a join("\n", ...) on the contents, so we have to append a
1909             # newline to every line, and then nix the last one later.
1910             }
1911              
1912 69         89 if( DEBUG > 4 ) {
1913             print STDERR "<<\n";
1914             for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1915             print STDERR "_verbatim_format $i: $p->[$i]";
1916             }
1917             print STDERR ">>\n";
1918             }
1919              
1920 69         181 for(my $i = $#$p; $i > 2; $i--) {
1921             # work backwards over the lines, except the first (#2)
1922              
1923             #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1924             # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1925             # look at a formatty line preceding a nonformatty one
1926 286         320 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1927 286 100       489 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1928 17         31 DEBUG > 5 and print STDERR " It's a formatty line. ",
1929             "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1930              
1931 17 100       36 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1932 7         27 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1933 7         21 next;
1934             } else {
1935 16         22 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1936             }
1937             } else {
1938 271         304 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1939 271         441 next;
1940             }
1941              
1942             # A formatty line has to have #: in the first two columns, and uses
1943             # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1944             # Example:
1945             # What do you want? i like pie. [or whatever]
1946             # #:^^^^^^^^^^^^^^^^^ /////////////
1947              
1948              
1949 10         20 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1950              
1951 16         57 $formatting = ' ' . $1;
1952 16         49 $formatting =~ s/\s+$//s; # nix trailing whitespace
1953 16 50 33     67 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1954 6         11 splice @$p,$i,1; # remove this line
1955 6         7 $i--; # don't consider next line
1956 6         16 next;
1957             }
1958              
1959 14 100       34 if( length($formatting) >= length($p->[$i-1]) ) {
1960 40         75 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1961             } else {
1962 44         85 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1963             }
1964             # Make $formatting and the previous line be exactly the same length,
1965             # with $formatting having a " " as the last character.
1966              
1967 47         100 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1968              
1969              
1970 220         268 my @new_line;
1971 220         552 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
1972             #print STDERR "Format matches $1\n";
1973              
1974 91 100       191 if($2) {
1975             #print STDERR "SKIPPING <$2>\n";
1976 69         291 push @new_line,
1977             substr($p->[$i-1], pos($formatting)-length($1), length($1));
1978             } else {
1979             #print STDERR "SNARING $+\n";
1980 195 50       385 push @new_line, [
    100          
    100          
1981             (
1982             $3 ? 'VerbatimB' :
1983             $4 ? 'VerbatimI' :
1984             $5 ? 'VerbatimBI' : die("Should never get called")
1985             ), {},
1986             substr($p->[$i-1], pos($formatting)-length($1), length($1))
1987             ];
1988             #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1989             }
1990             }
1991 183         327 my @nixed =
1992             splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1993 10         17 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1994              
1995 10         12 DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1996             scalar(@new_line), "):",
1997             map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1998 10         31 $i--; # So the next line we scrutinize is the line before the one
1999             # that we just went and formatted
2000             }
2001              
2002 65         119 $p->[0] = 'VerbatimFormatted';
2003              
2004             # Collapse adjacent text nodes, just for kicks.
2005 65         160 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2006 173 0 0     195 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2007 173         290 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2008 0         0 $p->[$i] .= splice @$p, $i+1, 1; # merge
2009 0         0 --$i; # and back up
2010             }
2011             }
2012              
2013             # Now look for the last text token, and remove the terminal newline
2014 65         193 for( my $i = $#$p; $i >= 2; $i-- ) {
2015             # work backwards over the tokens, even the first
2016 65 50       159 if( !ref($p->[$i]) ) {
2017 65 50       307 if($p->[$i] =~ s/\n$//s) {
2018 65         89 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2019             } else {
2020 0         0 DEBUG > 5 and print STDERR
2021             "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2022             }
2023 65         130 last; # we only want the next one
2024             }
2025             }
2026              
2027 65         131 return;
2028             }
2029              
2030              
2031             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2032              
2033              
2034             sub _treelet_from_formatting_codes {
2035             # Given a paragraph, returns a treelet. Full of scary tokenizing code.
2036             # Like [ '~Top', {'start_line' => $start_line},
2037             # "I like ",
2038             # [ 'B', {}, "pie" ],
2039             # "!"
2040             # ]
2041             # This illustrates the general format of a treelet. It is an array:
2042             # [0] is a scalar indicating its type. In the example above, the
2043             # types are '~Top' and 'B'
2044             # [1] is a hash of various flags about it, possibly empty
2045             # [2] - [N] are an ordered list of the subcomponents of the treelet.
2046             # Scalars are literal text, refs are sub-treelets, to
2047             # arbitrary levels. Stringifying a treelet will recursively
2048             # stringify the sub-treelets, concatentating everything
2049             # together to form the exact text of the treelet.
2050              
2051 3696     3749   7025 my($self, $para, $start_line, $preserve_space) = @_;
2052              
2053 3696         9504 my $treelet = ['~Top', {'start_line' => $start_line},];
2054              
2055 3696 100 100     12299 unless ($preserve_space || $self->{'preserve_whitespace'}) {
2056 2829         20248 $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2057 2829         4691 $para =~ s/ $//;
2058 2829         3356 $para =~ s/^ //;
2059             }
2060              
2061             # Only apparent problem the above code is that N<< >> turns into
2062             # N<< >>. But then, word wrapping does that too! So don't do that!
2063              
2064              
2065             # As a Start-code is encountered, the number of opening bracket '<'
2066             # characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2067             # etc). When closing brackets are found in the text, at least this number
2068             # (plus the 1) will be required to mean the Start-code is terminated. When
2069             # those are found, @stack is popped.
2070 3696         4625 my @stack;
2071              
2072 3696         5525 my @lineage = ($treelet);
2073 3696         4571 my $raw = ''; # raw content of L<> fcode before splitting/processing
2074             # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2075             # into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2076             # the 'collapse and trim all whitespace first' lines just above.
2077 3696         4125 my $inL = 0;
2078              
2079 3696         4102 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2080              
2081             # Here begins our frightening tokenizer RE. The following regex matches
2082             # text in four main parts:
2083             #
2084             # * Start-codes. The first alternative matches C< or C<<, the latter
2085             # followed by some whitespace. $1 will hold the entire start code
2086             # (including any space following a multiple-angle-bracket delimiter),
2087             # and $2 will hold only the additional brackets past the first in a
2088             # multiple-bracket delimiter. length($2) + 1 will be the number of
2089             # closing brackets we have to find.
2090             #
2091             # * Closing brackets. Match some amount of whitespace followed by
2092             # multiple close brackets. The logic to see if this closes anything
2093             # is down below. Note that in order to parse C<< >> correctly, we
2094             # have to use look-behind (?<=\s\s), since the match of the starting
2095             # code will have consumed the whitespace.
2096             #
2097             # * A single closing bracket, to close a simple code like C<>.
2098             #
2099             # * Something that isn't a start or end code. We have to be careful
2100             # about accepting whitespace, since perlpodspec says that any whitespace
2101             # before a multiple-bracket closing delimiter should be ignored.
2102             #
2103 3733         33190 while($para =~
2104             m/\G
2105             (?:
2106             # Match starting codes, including the whitespace following a
2107             # multiple-delimiter start code. $1 gets the whole start code and
2108             # $2 gets all but one of the
2109             ([A-Z]<(?:(<+)\s+)?)
2110             |
2111             # Match multiple-bracket end codes. $3 gets the whitespace that
2112             # should be discarded before an end bracket but kept in other cases
2113             # and $4 gets the end brackets themselves. ($3 can be empty if the
2114             # construct is empty, like C<< >>, and all the white-space has been
2115             # gobbled up already, considered to be space after the opening
2116             # bracket. In this case we use look-behind to verify that there are
2117             # at least 2 spaces in a row before the ">".)
2118             (\s+|(?<=\s\s))(>{2,})
2119             |
2120             (\s?>) # $5: simple end-codes
2121             |
2122             ( # $6: stuff containing no start-codes or end-codes
2123             (?:
2124             [^A-Z\s>]
2125             |
2126             (?:
2127             [A-Z](?!<)
2128             )
2129             |
2130             # whitespace is ok, but we don't want to eat the whitespace before
2131             # a multiple-bracket end code.
2132             # NOTE: we may still have problems with e.g. S<< >>
2133             (?:
2134             \s(?!\s*>{2,})
2135             )
2136             )+
2137             )
2138             )
2139             /xgo
2140             ) {
2141 14886         17622 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2142 14849 100       37605 if(defined $1) {
    100          
    100          
    50          
2143 2990         3287 my $bracket_count; # How many '<<<' in a row this has. Needed for
2144             # Pod::Simple::JustPod
2145 2990 100       4555 if(defined $2) {
2146 114         151 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2147 151         337 $bracket_count = length($2) + 1;
2148 151         287 push @stack, $bracket_count; # length of the necessary complex
2149             # end-code string
2150             } else {
2151 2913         3211 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2152 2913         3703 push @stack, 0; # signal that we're looking for simple
2153 2876         3373 $bracket_count = 1;
2154             }
2155 3027         5416 my $code = substr($1,0,1);
2156 3027 100       4571 if ('L' eq $code) {
2157 1180 100       2407 if ($inL) {
2158 748         2090 $raw .= $1;
2159 748         2672 $self->scream( $start_line,
2160             'Nested L<> are illegal. Pretending inner one is '
2161             . 'X<...> so can continue looking for other errors.');
2162 748         3908 $code = "X";
2163             }
2164             else {
2165 1179         2070 $raw = ""; # reset raw content accumulator
2166 1179         1853 $inL = @stack;
2167             }
2168             } else {
2169 3304 100       5012 $raw .= $1 if $inL;
2170             }
2171 3737         7388 push @lineage, [ $code, {}, ]; # new node object
2172              
2173             # Tell Pod::Simple::JustPod how many brackets there were, but to save
2174             # space, not in the most usual case of there was just 1. It can be
2175             # inferred by the absence of this element. Similarly, if there is more
2176             # than one bracket, extract the white space between the final bracket
2177             # and the real beginning of the interior. Save that if it isn't just a
2178             # single space
2179 3737 100 100     8021 if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2180 765         985 $lineage[-1][1]{'~bracket_count'} = $bracket_count;
2181 765         985 my $lspacer = substr($1, 1 + $bracket_count);
2182 765 100       6195 $lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2183             }
2184 4023         4636 push @{ $lineage[-2] }, $lineage[-1];
  4023         18656  
2185             } elsif(defined $4) {
2186 210         271 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2187             # This is where it gets messy...
2188 210 100       676 if(! @stack) {
    100          
    100          
    50          
2189             # We saw " >>>>" but needed nothing. This is ALL just stuff then.
2190 2         5 DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2191 2         4 push @{ $lineage[-1] }, $3, $4;
  2         6  
2192 88         101 next;
2193             } elsif(!$stack[-1]) {
2194             # We saw " >>>>" but needed only ">". Back pos up.
2195 87         138 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2196 87         107 push @{ $lineage[-1] }, $3; # That was a for-real space, too.
  88         178  
2197 88         145 pos($para) = pos($para) - length($4) + 1;
2198             } elsif($stack[-1] == length($4)) {
2199             # We found " >>>>", and it was exactly what we needed. Commonest case.
2200 114         151 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2201             } elsif($stack[-1] < length($4)) {
2202             # We saw " >>>>" but needed only " >>". Back pos up.
2203 0         0 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2204 0         0 pos($para) = pos($para) - length($4) + $stack[-1];
2205             } else {
2206             # We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2207 7         17 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2208 7         14 push @{ $lineage[-1] }, $3, $4;
  7         25  
2209 95         201 next;
2210             }
2211             #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2212              
2213 202 50 66     551 if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2214 91 100       232 if ($3 ne "") {
    100          
2215 1         6 $lineage[-1][1]{'~rspacer'} = $3;
2216             }
2217             elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2218              
2219             # Here we had something like C<< >> which was a false positive
2220 1         3 delete $lineage[-1][1]{'~lspacer'};
2221             }
2222             else {
2223             $lineage[-1][1]{'~rspacer'}
2224 1         4 = substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2225 89         107 chop $lineage[-1][1]{'~lspacer'};
2226             }
2227             }
2228              
2229 202 100       681 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  6         11  
  118         298  
2230             # Keep the element from being childless
2231              
2232 114 100       271 if ($inL == @stack) {
2233 22         46 $lineage[-1][1]{'raw'} = $raw;
2234 22         28 $inL = 0;
2235             }
2236              
2237 114         167 pop @stack;
2238 117         169 pop @lineage;
2239              
2240 117 100       609 $raw .= $3.$4 if $inL;
2241              
2242             } elsif(defined $5) {
2243 2974         3068 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2244              
2245 2974 100 100     8275 if(@stack and ! $stack[-1]) {
2246             # We're indeed expecting a simple end-code
2247 2869         3084 DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2248              
2249 2868 50       4930 if(length($5) == 2) { # There was a space there: " >"
    100          
2250 0         0 push @{ $lineage[-1] }, ' ';
  0         0  
2251 2868         5030 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2252 36         61 push @{ $lineage[-1] }, ''; # keep it from being really childless
  36         79  
2253             }
2254              
2255 2872 100       5529 if ($inL == @stack) {
2256 410         962 $lineage[-1][1]{'raw'} = $raw;
2257 410         525 $inL = 0;
2258             }
2259              
2260 2868         3389 pop @stack;
2261 2868         3319 pop @lineage;
2262             } else {
2263 103         277 DEBUG > 4 and print STDERR " It's just stuff.\n";
2264 107         136 push @{ $lineage[-1] }, $5;
  103         259  
2265             }
2266              
2267 2975 100       18390 $raw .= $5 if $inL;
2268              
2269             } elsif(defined $6) {
2270 8770         8988 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2271 8766         9578 push @{ $lineage[-1] }, $6;
  8766         19988  
2272 8770 100       28260 $raw .= $6 if $inL;
2273             # XXX does not capture multiplace whitespaces -- 'raw' ends up with
2274             # at most 1 leading/trailing whitespace, why not all of it?
2275             # Answer, because we deliberately trimmed it above
2276              
2277             } else {
2278             # should never ever ever ever happen
2279 4         6 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2280 4         20 die "SPORK 512512!";
2281             }
2282             }
2283              
2284 3781 100       6465 if(@stack) { # Uhoh, some sequences weren't closed.
2285 93         284 my $x= "...";
2286 92         118 while(@stack) {
2287 92 50       208 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  0         0  
  8         21  
2288             # Hmmmmm!
2289              
2290 92         172 my $code = (pop @lineage)->[0];
2291 13         17 my $ender_length = pop @stack;
2292 13 50       37 if($ender_length) {
2293 84         167 --$ender_length;
2294 0         0 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2295             } else {
2296 8         30 $x = $code . "<$x>";
2297             }
2298             }
2299 92         128 DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2300 92         135 $self->whine($start_line,
2301             "Unterminated $x sequence",
2302             );
2303             }
2304              
2305 3697         9991 return $treelet;
2306             }
2307              
2308             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2309              
2310             sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2311 1     0 0 3 return stringify_lol($_[1]);
2312             }
2313              
2314             sub stringify_lol { # function: stringify_lol($lol)
2315 2607     3409 0 3975 my $string_form = '';
2316 2691         5646 _stringify_lol( $_[0] => \$string_form );
2317 3462         8114 return $string_form;
2318             }
2319              
2320             sub _stringify_lol { # the real recursor
2321 3842     3037   5496 my($lol, $to) = @_;
2322 3842         8112 for(my $i = 2; $i < @$lol; ++$i) {
2323 4840 100 100     12574 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
      66        
2324 380         669 _stringify_lol( $lol->[$i], $to); # recurse!
2325             } else {
2326 3604         7916 $$to .= $lol->[$i];
2327             }
2328             }
2329 3733         5786 return;
2330             }
2331              
2332             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2333              
2334             sub _dump_curr_open { # return a string representation of the stack
2335 0     32   0 my $curr_open = $_[0]{'curr_open'};
2336              
2337 0 50       0 return '[empty]' unless @$curr_open;
2338             return join '; ',
2339 0         0 map {;
2340             ($_->[0] eq '=for')
2341             ? ( ($_->[1]{'~really'} || '=over')
2342 0 50 50     0 . ' ' . $_->[1]{'target'})
2343             : $_->[0]
2344             }
2345             @$curr_open
2346             ;
2347             }
2348              
2349             ###########################################################################
2350             my %pretty_form = (
2351             "\a" => '\a', # ding!
2352             "\b" => '\b', # BS
2353             "\e" => '\e', # ESC
2354             "\f" => '\f', # FF
2355             "\t" => '\t', # tab
2356             "\cm" => '\cm',
2357             "\cj" => '\cj',
2358             "\n" => '\n', # probably overrides one of either \cm or \cj
2359             '"' => '\"',
2360             '\\' => '\\\\',
2361             '$' => '\\$',
2362             '@' => '\\@',
2363             '%' => '\\%',
2364             '#' => '\\#',
2365             );
2366              
2367             sub pretty { # adopted from Class::Classless
2368             # Not the most brilliant routine, but passable.
2369             # Don't give it a cyclic data structure!
2370 502     1494 0 2044 my @stuff = @_; # copy
2371 502         620 my $x;
2372             my $out =
2373             # join ",\n" .
2374             join ", ",
2375 502         634 map {;
2376 592 50 100     3394 if(!defined($_)) {
    100 66        
    50 33        
    100          
    100          
    100          
2377 0         0 "undef";
2378             } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2379 58         101 $x = "[ " . pretty(@$_) . " ]" ;
2380 58         111 $x;
2381             } elsif(ref($_) eq 'SCALAR') {
2382 0         0 $x = "\\" . pretty($$_) ;
2383 0         0 $x;
2384             } elsif(ref($_) eq 'HASH') {
2385 795         2444 my $hr = $_;
2386             $x = "{" . join(", ",
2387 48         500 map(pretty($_) . '=>' . pretty($hr->{$_}),
2388             sort keys %$hr ) ) . "}" ;
2389 48         149 $x;
2390 2         3 } elsif(!length($_)) { q{''} # empty string
2391             } elsif(
2392             $_ eq '0' # very common case
2393             or(
2394             m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
2395             and $_ ne '-0' # the strange case that RE lets thru
2396             )
2397 26         68 ) { $_;
2398             } else {
2399             # Yes, explicitly name every character desired. There are shorcuts one
2400             # could make, but I (Karl Williamson) was afraid that some Perl
2401             # releases would have bugs in some of them. For example [A-Z] works
2402             # even on EBCDIC platforms to match exactly the 26 uppercase English
2403             # letters, but I don't know if it has always worked without bugs. It
2404             # seemed safest just to list the characters.
2405             # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2406 458 0       883 s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
  0         0  
2407             <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2408 458         1457 #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg;
2409             qq{"$_"};
2410             }
2411             } @stuff;
2412 502         1903 # $out =~ s/\n */ /g if length($out) < 75;
2413             return $out;
2414             }
2415              
2416             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2417              
2418             # A rather unsubtle method of blowing away all the state information
2419             # from a parser object so it can be reused. Provided as a utility for
2420             # backward compatibility in Pod::Man, etc. but not recommended for
2421             # general use.
2422              
2423 5     9 0 6926 sub reinit {
2424 5         19 my $self = shift;
2425             foreach (qw(source_dead source_filename doc_has_started
2426             start_of_pod_block content_seen last_was_blank paras curr_open
2427             line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2428             Title)) {
2429 83         123  
2430             delete $self->{$_};
2431             }
2432             }
2433              
2434             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2435             1;
2436