File Coverage

blib/lib/DMS/XS/Parser.pm
Criterion Covered Total %
statement 126 211 59.7
branch 36 96 37.5
condition 6 18 33.3
subroutine 22 41 53.6
pod 0 16 0.0
total 190 382 49.7


line stmt bran cond sub pod time code
1             package DMS::XS::Parser;
2             # XS parser — thin Perl shim over the C DMS parser.
3             #
4             # Public API mirrors DMS::Parser (pure Perl) so the two backends are drop-in
5             # interchangeable. SPEC v0.14 names:
6             #
7             # decode($src) -> body
8             # decode_document($src) -> { meta, body, comments, original_forms }
9             # encode($doc) -> DMS source
10             # encode_lite($doc) -> canonical DMS source
11             #
12             # Old names (parse, parse_document, to_dms, to_dms_lite, ...) remain as
13             # deprecated aliases for one release; they emit a one-time Carp warning
14             # and forward to the new canonical sub.
15             #
16             # Value types returned follow the same conventions as the pure-Perl parser:
17             # strings are unblessed Perl scalars; booleans, integers, floats, and
18             # date/time values are blessed into the DMS::* sentinel classes defined by
19             # DMS::Parser. Maps are Tie::IxHash-tied hashrefs; lists are arrayrefs.
20            
21 3     3   355087 use strict;
  3         8  
  3         106  
22 3     3   16 use warnings;
  3         18  
  3         186  
23 3     3   17 use Carp ();
  3         7  
  3         323  
24             # Tie::IxHash is loaded lazily by Parser.xs on first full-mode parse
25             # (via load_module inside new_ixhash_fast). Lite-mode-only callers
26             # never trigger that load and skip the ~7 ms Tie::IxHash.pm parse.
27             # Full-mode users get the same Document shape as before — Tie::IxHash
28             # methods are defined by the time `parse_document` returns the tied
29             # hash, so `keys %$h` etc. work normally.
30            
31             our $VERSION = '0.3.0';
32            
33             # Capability flag — this port ships lite-mode decode + lite-mode encode_lite.
34             # See SPEC §Parsing modes — full and lite.
35             our $SUPPORTS_LITE_MODE = 1;
36            
37             # Capability flag — this port ships unordered-table parse mode.
38             # See SPEC §Unordered tables.
39             our $SUPPORTS_IGNORE_ORDER = 1;
40            
41             require XSLoader;
42             XSLoader::load('DMS::XS::Parser', $VERSION);
43            
44             # Capture the XSUBs under private aliases so we can redefine the public
45             # `parse_*` names as deprecated Carp::carp wrappers below. The XSUBs
46             # themselves are bound under their original names by the XS module
47             # (renaming them at the C level would require a recompile and break
48             # any old DLL on disk); aliasing into `_xsub_*` lets us reach them
49             # without recursion. SPEC v0.14 rename: parse_* → decode_*.
50             {
51 3     3   21 no strict 'refs';
  3         11  
  3         384  
52             *_xsub_parse_document = \&parse_document;
53             *_xsub_parse_document_lite = \&parse_document_lite;
54             }
55            
56             # Sentinel classes. These mirror DMS::Parser's (pure-Perl) classes so that
57             # encoders and tests work against both backends unchanged. We define them
58             # only if pure-Perl DMS::Parser hasn't already been loaded — otherwise we
59             # inherit its definitions and stay compatible.
60             sub _ensure_classes {
61 3 50   3   17 return if defined &DMS::LocalDate::new;
62 3     3   24 no strict 'refs';
  3         7  
  3         6688  
63             # Typed sentinels are blessed scalar refs (one alloc per value instead
64             # of the three in a blessed-hash shape). Matches the pure-Perl parser.
65 3         9 for my $cls (qw(DMS::LocalDate DMS::LocalTime DMS::LocalDateTime
66             DMS::OffsetDateTime)) {
67 12     0   65 *{"${cls}::new"} = sub { my $v = "$_[1]"; bless \$v, $_[0] };
  12         158  
  0         0  
  0         0  
68 12     0   39 *{"${cls}::value"} = sub { ${ $_[0] } };
  12         47  
  0         0  
  0         0  
69             }
70 3     0   27 *DMS::Float::new = sub { my $v = 0 + $_[1]; bless \$v, $_[0] };
  0         0  
  0         0  
71 3     0   13 *DMS::Float::value = sub { ${ $_[0] } };
  0         0  
  0         0  
72 3     3   14 *DMS::Integer::new = sub { my $v = 0 + $_[1]; bless \$v, $_[0] };
  3         95  
  3         27  
73 3     0   11 *DMS::Integer::value = sub { $_[0] };
  0         0  
74 3     33   10 *DMS::Integer::bstr = sub { "${ $_[0] }" }; # force stringification
  33         39  
  33         96  
75 3     0   10 *DMS::Integer::is_neg = sub { ${ $_[0] } < 0 };
  0         0  
  0         0  
76 3 0   0   17 *DMS::Bool::new = sub { my $v = $_[1]?1:0; bless \$v, $_[0] };
  0         0  
  0         0  
77 3     0   24 *DMS::Bool::value = sub { ${ $_[0] } };
  0         0  
  0         0  
78             # Path-segment marker for list-index breadcrumb steps in the
79             # attached-comment AST. String keys remain plain scalars.
80 3     6   29 *DMS::Index::new = sub { my $v = 0 + $_[1]; bless \$v, $_[0] };
  6         7  
  6         12  
81 3     1   25 *DMS::Index::value = sub { ${ $_[0] } };
  1         3271  
  1         9  
82             # SPEC §"Unordered tables": marker class for body tables produced by
83             # the *_unordered entry points. Underlying storage is a plain Perl
84             # hashref (no Tie::IxHash). `to_dms` (full mode) refuses to round-trip
85             # a Document containing this variant; `to_dms_lite` accepts it.
86             *DMS::UnorderedTable::new = sub {
87 0     0   0 my ($class, $h) = @_;
88 0 0       0 $h = {} unless defined $h;
89 0         0 return bless $h, $class;
90 3         23 };
91             }
92             _ensure_classes();
93            
94             package DMS::XS::Parser;
95            
96             # SPEC §Decode/Encode (v0.14): canonical entry point. Returns the body
97             # only — meta and comments are dropped. Use decode_document() to keep
98             # them.
99             sub decode {
100 2     2 0 5442 my ($src) = @_;
101 2         7 my $doc = decode_document($src);
102 2         10 return $doc->{body};
103             }
104            
105             # SPEC §Parsing modes — full and lite. Body-only lite decode.
106             sub decode_lite {
107 0     0 0 0 my ($src) = @_;
108 0         0 return decode_document_lite($src)->{body};
109             }
110            
111             # SPEC §Front-matter-only decode. Returns the FM table as a hashref
112             # (lite-mode shape — sidecar order list at "\0__dms_keys"), or undef
113             # when the document has no front matter at all. Body bytes after the
114             # closing `+++` are NOT tokenized; bad-body documents with valid FM
115             # succeed.
116             #
117             # Implementation: pre-scan the source in pure Perl to locate the FM
118             # block (or determine its absence), truncate the input to bytes 0..end-
119             # of-closing-`+++`-line, and hand the truncated buffer to the C parser
120             # (`parse_document_lite`). Diagnostics inside the FM block are byte-
121             # identical to a full decode because the leading bytes (and therefore
122             # every line / column inside the block) are unchanged.
123             sub decode_front_matter {
124 11     11 0 207719 my ($src) = @_;
125 11         35 my ($state, $close_end) = _scan_front_matter_bounds($src);
126 11 100       33 if ($state eq 'no_fm') {
127 3         9 return undef;
128             }
129 8         14 my $sub;
130 8 100       19 if ($state eq 'unterminated') {
131             # Hand the original source straight to the C parser; it will
132             # reach EOF inside the FM scan and raise the canonical
133             # "unterminated front matter" error.
134 1         2 $sub = $src;
135             } else { # 'fm'
136             # Truncate to end-of-closing-`+++`-line. The C parser then has
137             # the complete FM block and an empty body; no body bytes get
138             # tokenized, so body errors can't surface. Line/column numbers
139             # inside the FM are byte-identical to a full decode.
140 7         15 $sub = substr($src, 0, $close_end);
141             }
142 8         358 my $doc = _xsub_parse_document_lite($sub);
143 4         24 return $doc->{meta};
144             }
145            
146             # Pre-scan to find the front matter delimiters. Returns one of:
147             # ('no_fm', undef) — no opening `+++` after trivia
148             # ('fm', $end_offset) — open + close found; $end_offset is
149             # the byte offset just past the EOL
150             # that ends the closing `+++` line
151             # ('unterminated',undef) — open found, no close
152             #
153             # Trivia recognized: blank lines (incl. CRLF), `# ...` line comments,
154             # `// ...` line comments, `### ... ###` block comments, `/* ... */`
155             # block comments. The scan only needs to be precise enough to locate
156             # `+++` reliably; it doesn't validate trivia content (the C parser
157             # will catch any malformed trivia when it re-scans the same prefix).
158             sub _scan_front_matter_bounds {
159 11     11   25 my ($src) = @_;
160 11         24 my $len = length($src);
161 11         17 my $i = 0;
162 11         37 while ($i < $len) {
163 11         31 my $c = substr($src, $i, 1);
164             # Inline whitespace.
165 11 50 33     67 if ($c eq ' ' || $c eq "\t") { $i++; next; }
  0         0  
  0         0  
166             # EOL.
167 11 100       30 if ($c eq "\n") { $i++; next; }
  1         2  
  1         3  
168 10 50       28 if ($c eq "\r") {
169 0 0       0 $i += (substr($src, $i, 2) eq "\r\n") ? 2 : 1;
170 0         0 next;
171             }
172             # `### ... ###` block comment.
173 10 50       29 if (substr($src, $i, 3) eq '###') {
174 0         0 my $end = index($src, "###", $i + 3);
175 0 0       0 return ('no_fm', undef) if $end < 0;
176 0         0 $i = $end + 3;
177 0         0 next;
178             }
179             # `# ...` line comment.
180 10 100       25 if ($c eq '#') {
181 1         40 my $nl = index($src, "\n", $i);
182 1 50       7 $i = $nl < 0 ? $len : $nl + 1;
183 1         4 next;
184             }
185             # `// ...` line comment.
186 9 50 33     27 if ($c eq '/' && substr($src, $i, 2) eq '//') {
187 0         0 my $nl = index($src, "\n", $i);
188 0 0       0 $i = $nl < 0 ? $len : $nl + 1;
189 0         0 next;
190             }
191             # `/* ... */` block comment.
192 9 50 33     23 if ($c eq '/' && substr($src, $i, 2) eq '/*') {
193 0         0 my $end = index($src, '*/', $i + 2);
194 0 0       0 return ('no_fm', undef) if $end < 0;
195 0         0 $i = $end + 2;
196 0         0 next;
197             }
198 9         19 last;
199             }
200             # Now check for `+++` opener on its own line.
201 11 100       37 return ('no_fm', undef) if $i + 3 > $len;
202 9 100       24 return ('no_fm', undef) if substr($src, $i, 3) ne '+++';
203 8         16 my $j = $i + 3;
204             # Optional trailing inline whitespace, then EOL or EOF.
205 8         20 while ($j < $len) {
206 8         17 my $c = substr($src, $j, 1);
207 8 50 33     35 last if $c ne ' ' && $c ne "\t";
208 0         0 $j++;
209             }
210 8 50       19 if ($j < $len) {
211 8         15 my $c = substr($src, $j, 1);
212 8 50 33     24 if ($c ne "\n" && $c ne "\r") {
213             # `+++` followed by other content on same line is not an
214             # opener (per SPEC §Front matter).
215 0         0 return ('no_fm', undef);
216             }
217             }
218             # Search for the closing `+++` line. Each candidate must be `+++`
219             # on its own line, optionally surrounded by inline whitespace.
220             # We walk line-by-line starting from `$j` (the EOL after the open).
221 8         13 my $p = $j;
222 8         19 while ($p < $len) {
223             # Skip the EOL we're sitting on.
224 17         54 my $c = substr($src, $p, 1);
225 17 50       39 if ($c eq "\n") { $p++; }
  17 0       25  
226             elsif ($c eq "\r") {
227 0 0       0 $p += (substr($src, $p, 2) eq "\r\n") ? 2 : 1;
228             }
229 17 100       40 last if $p >= $len;
230             # Find end of this line.
231 16         26 my $line_start = $p;
232 16         31 my $nl = index($src, "\n", $p);
233 16 50       34 my $line_end = $nl < 0 ? $len : $nl;
234             # If line_end - 1 is `\r`, the line proper ends one before.
235 16         39 my $line_end_no_cr = $line_end;
236 16 50 33     63 if ($line_end > $line_start
237             && substr($src, $line_end - 1, 1) eq "\r") {
238 0         0 $line_end_no_cr = $line_end - 1;
239             }
240 16         36 my $line = substr($src, $line_start, $line_end_no_cr - $line_start);
241 16         27 my $trimmed = $line;
242 16         50 $trimmed =~ s/^[ \t]+//;
243 16         46 $trimmed =~ s/[ \t]+$//;
244 16 100       40 if ($trimmed eq '+++') {
245             # Include the EOL after the closing `+++` in the truncated
246             # range, so the C parser sees a complete line.
247 7 50       16 my $close_end = $nl < 0 ? $len : $nl + 1;
248 7         27 return ('fm', $close_end);
249             }
250 9         24 $p = $line_end; # advance to the EOL position
251             }
252 1         4 return ('unterminated', undef);
253             }
254            
255             # decode_document and decode_document_lite forward to the XS-defined
256             # subs (still bound under their original names in Parser.xs, captured
257             # above as _xsub_parse_document / _xsub_parse_document_lite). The
258             # rename happens Perl-side so DMS-XS keeps loading any existing .dll
259             # without a rebuild.
260 39     39 0 433322 sub decode_document { goto &_xsub_parse_document }
261 0     0 0 0 sub decode_document_lite { goto &_xsub_parse_document_lite }
262            
263             # Re-emit a parsed Document as DMS source. See SPEC §encode.
264             #
265             # Note: the underlying C parser does not yet record `original_forms`
266             # (integer-base / string-form preservation lives in the pure-Perl port).
267             # When `original_forms` is missing, the emitter falls back to defaults:
268             # integers render as canonical decimal, strings as basic-quoted. Comments
269             # and data structure are preserved via the C parser's existing comment AST.
270             sub encode {
271 19     19 0 7084 my ($doc) = @_;
272 19         1110 require DMS::Emitter;
273 19         62 return DMS::Emitter::encode($doc);
274             }
275            
276             # Lite-mode emit: canonical DMS source — no comments, decimal integers,
277             # basic-quoted strings — ignoring any comments / original_forms in $doc.
278             # `decode(encode_lite($doc))` is data-equivalent to $doc; round-trip of
279             # comment + literal-form is *not* preserved. SPEC §encode.
280             #
281             # Implemented entirely in Perl (DMS::Emitter is shared between the
282             # pure-Perl and XS backends — same Document shape, same walk).
283             sub encode_lite {
284 2     2 0 1001 my ($doc) = @_;
285             # Fast path: C-side lite emitter walks the Perl tree and writes DMS
286             # bytes directly. Skips the per-kvpair Perl-VM trips of the pure-Perl
287             # Emitter. Falls back to the pure-Perl path when the XS function
288             # isn't available (older builds) or when the document contains a
289             # DMS::UnorderedTable that the C path doesn't yet handle specially
290             # — for the bench's normalized fixture, this is the path.
291 2 50       10 if (defined &encode_lite_xs) {
292 0         0 return encode_lite_xs($doc);
293             }
294 2 50       8 if (defined &to_dms_lite_xs) {
295             # Backward compat — pre-rebuild XS exposes the old XSUB name.
296 2         65 return to_dms_lite_xs($doc);
297             }
298 0         0 require DMS::Emitter;
299 0         0 return DMS::Emitter::encode_lite($doc);
300             }
301            
302             # Deprecated aliases (SPEC v0.14: parse → decode, to_dms → encode).
303             # Removed in the next release. Each warns once per process via Carp.
304             { my $warned;
305             sub parse {
306 1 50   1 0 10 unless ($warned++) {
307 1         247 Carp::carp(
308             'DMS::XS::Parser::parse() is deprecated; use decode() instead. '
309             . 'SPEC v0.14 renamed parse() to decode().');
310             }
311 1         11 goto &decode;
312             }
313             }
314             { my $warned;
315             sub parse_lite {
316 0 0   0 0 0 unless ($warned++) {
317 0         0 Carp::carp(
318             'DMS::XS::Parser::parse_lite() is deprecated; use decode_lite() instead. '
319             . 'SPEC v0.14 renamed parse_lite() to decode_lite().');
320             }
321 0         0 goto &decode_lite;
322             }
323             }
324             { my $warned;
325             sub to_dms {
326 1 50   1 0 82 unless ($warned++) {
327 1         143 Carp::carp(
328             'DMS::XS::Parser::to_dms() is deprecated; use encode() instead. '
329             . 'SPEC v0.14 renamed to_dms() to encode().');
330             }
331 1         13 goto &encode;
332             }
333             }
334             { my $warned;
335             sub to_dms_lite {
336 1 50   1 0 9 unless ($warned++) {
337 1         167 Carp::carp(
338             'DMS::XS::Parser::to_dms_lite() is deprecated; use encode_lite() instead. '
339             . 'SPEC v0.14 renamed to_dms_lite() to encode_lite().');
340             }
341 1         12 goto &encode_lite;
342             }
343             }
344            
345             # Helper matching DMS::Parser::new_table — returns an IxHash-tied hashref.
346             sub new_table {
347 0     0 0 0 tie my %h, 'Tie::IxHash';
348 0         0 return \%h;
349             }
350            
351             # SPEC §"Unordered tables" — opt-in. The underlying C parser builds
352             # Tie::IxHash tied tables; rather than fork the C code, we walk the
353             # returned tree post-parse and replace every body table with a plain
354             # DMS::UnorderedTable hashref (insertion-order tracking dropped). Front
355             # matter is intentionally left alone — meta stays ordered per spec.
356             sub decode_document_unordered {
357 0     0 0 0 my ($src) = @_;
358 0         0 my $doc = _xsub_parse_document($src);
359 0 0       0 $doc->{body} = _to_unordered($doc->{body}) if defined $doc->{body};
360 0         0 return $doc;
361             }
362            
363             sub decode_lite_document_unordered {
364 0     0 0 0 my ($src) = @_;
365 0         0 my $doc = _xsub_parse_document_lite($src);
366 0 0       0 $doc->{body} = _to_unordered($doc->{body}) if defined $doc->{body};
367 0         0 return $doc;
368             }
369            
370             # Deprecated aliases (SPEC v0.14). Removed in the next release.
371             # Deprecated wrappers redefining the XSUB-bound `parse_document` /
372             # `parse_document_lite` in the symbol table. They warn once per process
373             # and forward to the captured XSUB (no recursion).
374             { my $warned;
375 3     3   33 no warnings 'redefine';
  3         5  
  3         662  
376             *parse_document = sub {
377 1 50   1   17 unless ($warned++) {
378 1         164 Carp::carp(
379             'DMS::XS::Parser::parse_document() is deprecated; '
380             . 'use decode_document() instead. '
381             . 'SPEC v0.14 renamed parse_document() to decode_document().');
382             }
383 1         28 goto &_xsub_parse_document;
384             };
385             }
386             { my $warned;
387 3     3   26 no warnings 'redefine';
  3         7  
  3         1977  
388             *parse_document_lite = sub {
389 0 0   0     unless ($warned++) {
390 0           Carp::carp(
391             'DMS::XS::Parser::parse_document_lite() is deprecated; '
392             . 'use decode_document_lite() instead. '
393             . 'SPEC v0.14 renamed parse_document_lite() to decode_document_lite().');
394             }
395 0           goto &_xsub_parse_document_lite;
396             };
397             }
398            
399             { my $warned;
400             sub parse_document_unordered {
401 0 0   0 0   unless ($warned++) {
402 0           Carp::carp(
403             'DMS::XS::Parser::parse_document_unordered() is deprecated; '
404             . 'use decode_document_unordered() instead. '
405             . 'SPEC v0.14 renamed parse_*() to decode_*().');
406             }
407 0           goto &decode_document_unordered;
408             }
409             }
410             { my $warned;
411             sub parse_lite_document_unordered {
412 0 0   0 0   unless ($warned++) {
413 0           Carp::carp(
414             'DMS::XS::Parser::parse_lite_document_unordered() is deprecated; '
415             . 'use decode_lite_document_unordered() instead. '
416             . 'SPEC v0.14 renamed parse_*() to decode_*().');
417             }
418 0           goto &decode_lite_document_unordered;
419             }
420             }
421            
422             # Recursive walk: convert each table (plain or Tie::IxHash-tied hash) to
423             # a DMS::UnorderedTable plain hashref. Lists are descended into; blessed
424             # leaves (DMS::Integer / Float / Bool / dates) are preserved as-is.
425             sub _to_unordered {
426 0     0     my ($v) = @_;
427 0 0         return $v if !defined $v;
428 0           my $r = ref($v);
429 0 0         return $v if $r eq '';
430             # Blessed sentinels: leaves. (DMS::UnorderedTable shouldn't appear
431             # here at all — the XS parser doesn't produce it — but if it does we
432             # leave it.)
433 0           require Scalar::Util;
434 0 0         if (Scalar::Util::blessed($v)) {
435 0 0         return $v if $r ne 'DMS::UnorderedTable';
436             # Already unordered — recurse into children for safety.
437 0           my %h;
438 0           for my $k (keys %$v) {
439 0           $h{$k} = _to_unordered($v->{$k});
440             }
441 0           return bless \%h, 'DMS::UnorderedTable';
442             }
443 0 0         if ($r eq 'ARRAY') {
444 0           return [ map { _to_unordered($_) } @$v ];
  0            
445             }
446 0 0         if ($r eq 'HASH') {
447             # Tie::IxHash-tied hash or plain hash — either way, walk via
448             # `keys` (tied yields insertion order; plain yields hash order)
449             # and rebuild as a plain blessed UnorderedTable. We drop the
450             # Tie::IxHash magic by copying into a fresh `%h`.
451 0           my %h;
452 0           for my $k (keys %$v) {
453 0           $h{$k} = _to_unordered($v->{$k});
454             }
455 0           return bless \%h, 'DMS::UnorderedTable';
456             }
457 0           return $v;
458             }
459            
460             1;