F2.2: std/json reader — explicit-alloc parse with error surfacing
Add the JSON reader (parser) to library/modules/std/json.sx, the inverse of the F2.1 writer over the same value model: insertion-ordered objects, arrays, strings (full unescaping incl. \uXXXX + surrogate pairs), s64 integers, bool, null. Heap discipline (binding): exactly two allocation kinds, both through the EXPLICIT `alloc` parameter, never the implicit context allocator — composite backing stores (Array/Object.items via add/put) and decoded escaped-string buffers (bounded by the raw span). Un-escaped string values are zero-copy VIEWS into the input buffer (valid only while it lives); scalars carry no heap. Failure surfacing (hard contract): malformed input raises a meaningful JsonParseError variant (UnexpectedToken / UnexpectedEnd / BadEscape / BadNumber / TrailingGarbage) on the error channel, never a bogus value. Trailing non-whitespace is TrailingGarbage; fractions/exponents, out-of-s64 magnitudes, and leading zeros are BadNumber. Number accumulation runs in negative space so s64 MIN parses exactly. examples/0714-modules-json-reader.sx asserts the parsed structure (insertion order, every kind), proves the view-vs-decoded heap split by pointer containment, round-trips back through the writer byte-for-byte, decodes a surrogate-pair into 4 UTF-8 bytes, and checks every malformed variant. Filed issues/0078: a string `==` (or any sub-CFG operand) used in a short-circuit `and`/`or` emits invalid LLVM IR (stale PHI predecessor), hit while writing the example's assertions and worked around there by not combining comparisons with `and`/`or`. src/ untouched.
This commit is contained in:
137
examples/0714-modules-json-reader.sx
Normal file
137
examples/0714-modules-json-reader.sx
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
// JSON reader (parser) from `modules/std/json.sx` — the inverse of the
|
||||||
|
// F2.1 writer.
|
||||||
|
//
|
||||||
|
// Parses a representative document (nested object + array + a
|
||||||
|
// string-with-escapes + ints incl. negatives + bool + null) into the
|
||||||
|
// shared value model, then proves:
|
||||||
|
//
|
||||||
|
// 1. STRUCTURE — the parsed tree has the expected keys (in INSERTION
|
||||||
|
// order), values, and nesting.
|
||||||
|
// 2. HEAP DISCIPLINE — an un-escaped string value is a zero-copy VIEW
|
||||||
|
// into the input buffer (its bytes lie inside `src`), while an
|
||||||
|
// escaped string is DECODED into a fresh `alloc`-ed buffer (its
|
||||||
|
// bytes lie OUTSIDE `src`). Composite nodes + the decoded string are
|
||||||
|
// the only allocations, all through the explicit Arena.
|
||||||
|
// 3. ROUND-TRIP — feeding the parsed tree back to the writer reproduces
|
||||||
|
// the canonical input byte-for-byte.
|
||||||
|
// 4. UNICODE — `\uXXXX` (1- and 2-byte UTF-8 results) and a surrogate pair decode to
|
||||||
|
// the right UTF-8 bytes.
|
||||||
|
// 5. FAILURE SURFACING — every malformed input raises the right
|
||||||
|
// `JsonParseError` variant on the error channel, never a bogus value.
|
||||||
|
|
||||||
|
#import "modules/std.sx";
|
||||||
|
#import "modules/std/json.sx";
|
||||||
|
|
||||||
|
// Canonical document: no insignificant whitespace, escapes in the writer's
// own form — so re-serializing the parse must reproduce it exactly.
// Members, in insertion order: "name" (plain string), "esc" (string holding
// a `\n` escape), "xs" (array [10,-20]), "yes" (true), "nil" (null) and
// "sub" (nested object {"k":"v"}).
DOC :: "{\"name\":\"plain\",\"esc\":\"a\\nb\",\"xs\":[10,-20],\"yes\":true,\"nil\":null,\"sub\":{\"k\":\"v\"}}";
|
||||||
|
|
||||||
|
// Print one assertion result line: "<label>: FAIL" when `ok` is false,
// "<label>: ok" otherwise.
report :: (label: string, ok: bool) {
    if !ok {
        print("{}: FAIL\n", label);
    } else {
        print("{}: ok\n", label);
    }
}
|
||||||
|
|
||||||
|
// Membership test for the half-open interval [lo, hi): true exactly when
// lo <= x and x < hi. Expressed as nested `if`s instead of an `and`, so
// this example contains no short-circuit `and`/`or` at all — see
// issues/0078.
in_range :: (x: s64, lo: s64, hi: s64) -> bool {
    if x >= lo {
        if x < hi { return true; }
    }
    return false;
}
|
||||||
|
|
||||||
|
// Reports whether `parse(src, alloc)` produced exactly the error `want`.
// The two-value destructure captures the error tag instead of propagating
// it with `try`, so a malformed input never aborts the example.
raises :: (src: string, want: JsonParseError, alloc: Allocator) -> bool {
    _, got := parse(src, alloc);
    return got == want;
}
|
||||||
|
|
||||||
|
// Runs every check in order, printing one "<label>: ok|FAIL" line per
// assertion and a final "=== DONE ===". All parser allocations come from a
// single Arena that is torn down by the `defer` on exit.
//
// Every assertion is a single comparison: string `==` must not be an
// operand of a short-circuit `and`/`or` (issues/0078).
main :: () -> ! {
    gpa := GPA.init();
    arena := Arena.init(xx gpa, 8192);
    defer arena.deinit();

    // ── 1. Structure ─────────────────────────────────────────────────
    src := DOC;
    root := try parse(src, xx arena);

    root_is_obj := if root == { case .object: true; else: false; };
    report("root-is-object", root_is_obj);

    top := root.object;
    report("member-count", top.len == 6);
    report("key-order-0", top.items[0].key == "name");
    plain := top.items[0].val.str;
    report("string-plain", plain == "plain");
    escaped := top.items[1].val.str;
    report("string-escaped", escaped == "a\nb");   // the \n escape became byte 0x0A

    nums := top.items[2].val.array;
    report("array-len", nums.len == 2);
    report("array-pos", nums.items[0].int_ == 10);
    report("array-neg", nums.items[1].int_ == 0 - 20);

    flag := top.items[3].val.bool_;
    report("bool-value", flag == true);

    nil_is_null := if top.items[4].val == { case .null_: true; else: false; };
    report("null-value", nil_is_null);

    // Key and value are asserted separately rather than joined with `and`.
    inner := top.items[5].val.object;
    report("nested-key", inner.items[0].key == "k");
    report("nested-val", inner.items[0].val.str == "v");

    // ── 2. Heap discipline: view vs decoded ──────────────────────────
    // Compare raw addresses: a view must point inside `src`, a decoded
    // string must point somewhere else.
    src_lo : s64 = xx src.ptr;
    src_hi := src_lo + src.len;
    addr_plain : s64 = xx plain.ptr;       // "plain": no escape -> VIEW into src
    addr_escaped : s64 = xx escaped.ptr;   // "a\nb": escaped -> DECODED copy
    report("plain-is-view", in_range(addr_plain, src_lo, src_hi));
    report("escaped-allocated", !in_range(addr_escaped, src_lo, src_hi));

    // ── 3. Round-trip back through the writer ────────────────────────
    buf : [256]u8 = ---;
    sink := string.{ ptr = @buf[0], len = 256 };
    written := try write_to_buffer(root, sink);
    echoed := string.{ ptr = @buf[0], len = written };
    report("round-trip", echoed == src);

    // ── 4. Leading/trailing/inner whitespace is insignificant ────────
    spaced := try parse(" [ 1 , 2 , 3 ] ", xx arena);
    spaced_items := spaced.array;
    report("ws-count", spaced_items.len == 3);
    report("ws-first", spaced_items.items[0].int_ == 1);
    report("ws-last", spaced_items.items[2].int_ == 3);

    // Empty container literals (the manifest/db.json use these).
    empty_arr := try parse("[]", xx arena);
    report("empty-array", empty_arr.array.len == 0);
    empty_obj := try parse("{}", xx arena);
    report("empty-object", empty_obj.object.len == 0);

    // ── 5. Unicode: \uXXXX (1- and 2-byte) + surrogate pair (4-byte) ──
    // U+0041 -> 41, U+00E9 -> C3 A9, U+1F600 -> F0 9F 98 80; one byte per report.
    uni_val := try parse("\"\\u0041\\u00e9\\uD83D\\uDE00\"", xx arena);
    bytes := uni_val.str;
    report("uni-len", bytes.len == 7);
    report("uni-A", bytes[0] == 0x41);    // U+0041 -> 1 byte
    report("uni-e1", bytes[1] == 0xC3);   // U+00E9 -> 2 bytes
    report("uni-e2", bytes[2] == 0xA9);
    report("uni-s0", bytes[3] == 0xF0);   // U+1F600 (surrogate pair) -> 4 bytes
    report("uni-s1", bytes[4] == 0x9F);
    report("uni-s2", bytes[5] == 0x98);
    report("uni-s3", bytes[6] == 0x80);

    // ── 6. Malformed inputs each surface the right error variant ─────
    report("err-truncated", raises("{\"a\":", error.UnexpectedEnd, xx arena));
    report("err-bad-escape", raises("\"a\\xb\"", error.BadEscape, xx arena));
    report("err-trailing-junk", raises("[1,2] x", error.TrailingGarbage, xx arena));
    report("err-bad-token", raises("xyz", error.UnexpectedToken, xx arena));
    report("err-fraction", raises("1.5", error.BadNumber, xx arena));
    report("err-leading-zero", raises("01", error.BadNumber, xx arena));
    report("err-overflow", raises("9223372036854775808", error.BadNumber, xx arena));
    report("err-unterminated", raises("\"abc", error.UnexpectedEnd, xx arena));

    print("=== DONE ===\n");
    return;
}
|
||||||
1
examples/expected/0714-modules-json-reader.exit
Normal file
1
examples/expected/0714-modules-json-reader.exit
Normal file
@@ -0,0 +1 @@
|
|||||||
|
0
|
||||||
0
examples/expected/0714-modules-json-reader.stderr
Normal file
0
examples/expected/0714-modules-json-reader.stderr
Normal file
37
examples/expected/0714-modules-json-reader.stdout
Normal file
37
examples/expected/0714-modules-json-reader.stdout
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
root-is-object: ok
|
||||||
|
member-count: ok
|
||||||
|
key-order-0: ok
|
||||||
|
string-plain: ok
|
||||||
|
string-escaped: ok
|
||||||
|
array-len: ok
|
||||||
|
array-pos: ok
|
||||||
|
array-neg: ok
|
||||||
|
bool-value: ok
|
||||||
|
null-value: ok
|
||||||
|
nested-key: ok
|
||||||
|
nested-val: ok
|
||||||
|
plain-is-view: ok
|
||||||
|
escaped-allocated: ok
|
||||||
|
round-trip: ok
|
||||||
|
ws-count: ok
|
||||||
|
ws-first: ok
|
||||||
|
ws-last: ok
|
||||||
|
empty-array: ok
|
||||||
|
empty-object: ok
|
||||||
|
uni-len: ok
|
||||||
|
uni-A: ok
|
||||||
|
uni-e1: ok
|
||||||
|
uni-e2: ok
|
||||||
|
uni-s0: ok
|
||||||
|
uni-s1: ok
|
||||||
|
uni-s2: ok
|
||||||
|
uni-s3: ok
|
||||||
|
err-truncated: ok
|
||||||
|
err-bad-escape: ok
|
||||||
|
err-trailing-junk: ok
|
||||||
|
err-bad-token: ok
|
||||||
|
err-fraction: ok
|
||||||
|
err-leading-zero: ok
|
||||||
|
err-overflow: ok
|
||||||
|
err-unterminated: ok
|
||||||
|
=== DONE ===
|
||||||
@@ -0,0 +1,113 @@
|
|||||||
|
# Symptom
|
||||||
|
|
||||||
|
A string equality (`a == "x"`) used as an operand of a short-circuit
|
||||||
|
`and` / `or` emits LLVM IR that fails verification — the JIT (`sx run`)
|
||||||
|
and AOT paths both abort before running:
|
||||||
|
|
||||||
|
```
|
||||||
|
LLVM verification failed: PHI node entries do not match predecessors!
|
||||||
|
%bp = phi i1 [ false, %entry ], [ %str.eq10, %and.rhs.0 ]
|
||||||
|
label %entry
|
||||||
|
label %str.merge
|
||||||
|
Instruction does not dominate all uses!
|
||||||
|
%str.eq10 = phi i1 [ false, %and.rhs.0 ], [ %str.ceq9, %str.memcmp6 ]
|
||||||
|
%bp = phi i1 [ false, %entry ], [ %str.eq10, %and.rhs.0 ]
|
||||||
|
```
|
||||||
|
|
||||||
|
Integer/`error`-tag equality in the same position is fine — only the
|
||||||
|
string `==` operand miscompiles, because string `==` lowers to its own
|
||||||
|
multi-block memcmp with an internal PHI (`str.eq` ← {`str.memcmp`,
|
||||||
|
short-circuit false}). When that result is then consumed by the `and`/`or`
|
||||||
|
short-circuit merge, the predecessor set the outer PHI records does not
|
||||||
|
match the actual CFG: the string-compare's merge block becomes a
|
||||||
|
predecessor of the `and` merge, but the outer PHI still lists the original
|
||||||
|
`entry`/`and.rhs` edges. The inner `str.eq` PHI also ends up referenced
|
||||||
|
from a block it does not dominate.
|
||||||
|
|
||||||
|
# Reproduction
|
||||||
|
|
||||||
|
```sx
|
||||||
|
#import "modules/std.sx";
|
||||||
|
main :: () {
|
||||||
|
a := "k";
|
||||||
|
b := "v";
|
||||||
|
r := a == "k" and b == "v"; // string == as an `and` operand
|
||||||
|
print("{}\n", r);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./zig-out/bin/sx run repro.sx
|
||||||
|
LLVM verification failed: PHI node entries do not match predecessors!
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
`a == "k" or b == "v"` reproduces it identically (`or.rhs` in place of
|
||||||
|
`and.rhs`). A single `a == "k"` (no `and`/`or`) compiles and runs fine, as
|
||||||
|
does `x == 1 and y == 2` (integer operands). So the trigger is specifically
|
||||||
|
a **string `==`/`!=` as an operand of a short-circuit `and`/`or`** — the
|
||||||
|
operand emits its own `str.memcmp`/`str.merge` sub-CFG, and the
|
||||||
|
short-circuit PHI then records a stale predecessor block.
|
||||||
|
|
||||||
|
A related `match.merge`-predecessor variant of the same PHI mismatch also
|
||||||
|
appears in a LARGER function that mixes several enum-payload accesses
|
||||||
|
(`v.str`/`v.int_`) and `match` expressions with multiple `and`/`or`
|
||||||
|
operations (it surfaced while writing
|
||||||
|
`examples/0714-modules-json-reader.sx`). It did NOT reduce to a small
|
||||||
|
standalone repro — each construct compiles fine in isolation, and a single
|
||||||
|
payload-access operand (`true and e.a == 1`) or a preceding `match`
|
||||||
|
expression followed by an `and` of locals both compile — which points at
|
||||||
|
cumulative basic-block bookkeeping in the `and`/`or` lowering rather than a
|
||||||
|
single local pattern. The string-`==` case above is the reliable minimal
|
||||||
|
reproduction; the broader fix should address PHI predecessor tracking for
|
||||||
|
any `and`/`or` operand that emits intermediate basic blocks.
|
||||||
|
|
||||||
|
# Expected
|
||||||
|
|
||||||
|
`r` should be `true` (both compares hold) and the program print `true`.
|
||||||
|
Generally: a `string ==`/`!=` result must be usable as an operand of
|
||||||
|
`and`/`or` exactly like any other `bool`.
|
||||||
|
|
||||||
|
# Workaround (until fixed)
|
||||||
|
|
||||||
|
Don't combine string equality with `and`/`or` in one expression; split
|
||||||
|
into separate statements / separate boolean locals:
|
||||||
|
|
||||||
|
```sx
|
||||||
|
ok_k := a == "k";
|
||||||
|
ok_v := b == "v";
|
||||||
|
r := ok_k and ok_v; // each string-eq materialized before the short-circuit
|
||||||
|
```
|
||||||
|
|
||||||
|
# Background / where to look
|
||||||
|
|
||||||
|
The string `==` lowering (search `str.eq` / `str.memcmp` / `str.merge`
|
||||||
|
block names in `src/ir/lower.zig`) produces a value via a PHI that joins
|
||||||
|
the memcmp-equal block and the early-out (length-mismatch / short-circuit)
|
||||||
|
block. The boolean `and`/`or` lowering builds its own `and.rhs` /
|
||||||
|
`and.merge` (resp. `or.*`) blocks and a merge PHI. When the LHS (or RHS)
|
||||||
|
of the `and`/`or` is itself a string compare, the outer short-circuit
|
||||||
|
lowering must take the string-compare's *actual current block* (its merge
|
||||||
|
block) as the incoming predecessor for the outer PHI — not the block that
|
||||||
|
was current before the string compare emitted its sub-CFG. The mismatch
|
||||||
|
above is the classic "PHI incoming-block is stale after the operand
|
||||||
|
emitted new basic blocks" bug: the fix is to re-read the builder's current
|
||||||
|
insertion block when wiring the `and`/`or` PHI incoming edges, rather than
|
||||||
|
caching it before lowering the operand. This mirrors the shape of the
|
||||||
|
match-arm PHI fix in issue 0066.
|
||||||
|
|
||||||
|
Discovered while writing the std.json reader regression example
|
||||||
|
(`examples/0714-modules-json-reader.sx`, flow step F2.2): an assertion
|
||||||
|
`key == "k" and val.str == "v"` triggered it. The reader library code
|
||||||
|
itself does not use this pattern; the example was rewritten to assert the
|
||||||
|
two string equalities separately.
|
||||||
|
|
||||||
|
# Verification (once fixed)
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./zig-out/bin/sx run repro.sx # prints: true
|
||||||
|
```
|
||||||
|
|
||||||
|
Add a regression example (next free `examples/NNNN-*.sx` slot) that uses a
|
||||||
|
string `==` on both sides of an `and` and on both sides of an `or`, and
|
||||||
|
the full suite + `zig build test` must stay green.
|
||||||
@@ -1,12 +1,13 @@
|
|||||||
// =====================================================================
|
// =====================================================================
|
||||||
// json.sx — JSON value model + writer (stable key order), pure sx.
|
// json.sx — JSON value model + writer + reader (stable key order), pure sx.
|
||||||
//
|
//
|
||||||
// This module delivers the JSON VALUE MODEL and the WRITER. The reader
|
// This module delivers the JSON VALUE MODEL, the WRITER, and the READER
|
||||||
// (parser) lands separately; this file never reads JSON text.
|
// (parser). The model is built once and shared by both directions.
|
||||||
//
|
//
|
||||||
// NUMBERS ARE INTEGERS ONLY (s64) for this milestone — there is no
|
// NUMBERS ARE INTEGERS ONLY (s64) for this milestone — there is no
|
||||||
// fraction or exponent. A JSON value is one of: null, bool, integer,
|
// fraction or exponent. A JSON value is one of: null, bool, integer,
|
||||||
// string, array, object.
|
// string, array, object. The reader REJECTS a fraction or exponent
|
||||||
|
// (`error.BadNumber`) rather than silently truncating it.
|
||||||
//
|
//
|
||||||
// STABLE KEY ORDER: an object is NOT a hash map. It is an ORDERED list
|
// STABLE KEY ORDER: an object is NOT a hash map. It is an ORDERED list
|
||||||
// of (key, value) pairs that preserves INSERTION ORDER. Keys are never
|
// of (key, value) pairs that preserves INSERTION ORDER. Keys are never
|
||||||
@@ -333,3 +334,325 @@ write_to_file :: (v: Value, file: *File, staging: []u8) -> !JsonError {
|
|||||||
try sink.flush();
|
try sink.flush();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Reader (parser) ───────────────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// `parse(src, alloc)` turns a JSON document in `src` into the value model
|
||||||
|
// above. It is the inverse of the writer for the v0 scope: objects (in
|
||||||
|
// INSERTION ORDER), arrays, strings (with full unescaping incl. \uXXXX
|
||||||
|
// and surrogate pairs), s64 integers, bool, null.
|
||||||
|
//
|
||||||
|
// FAILURE SURFACING (hard contract): every malformed input raises on the
|
||||||
|
// error channel (`!JsonParseError`) — never a bogus or default value.
|
||||||
|
// Trailing non-whitespace after a complete value is `TrailingGarbage`.
|
||||||
|
// `pos` (the parser cursor) only approximates where the failure was detected, and `parse` does not return it — callers see just the error variant.
|
||||||
|
//
|
||||||
|
// NOT SUPPORTED (rejected, not silently accepted): a fraction or exponent
|
||||||
|
// in a number (`1.5`, `1e9`) → `BadNumber`; a number outside s64 →
|
||||||
|
// `BadNumber`; a leading-zero integer (`01`) → `BadNumber`. LENIENCY (accepted, not rejected): unescaped raw
|
||||||
|
// control bytes (< 0x20) inside a string are passed through verbatim (the
|
||||||
|
// minimal-reader leniency the manifest / db.json never exercise).
|
||||||
|
//
|
||||||
|
// HEAP DISCIPLINE (binding, see heap-discipline.md). Exactly two kinds of
|
||||||
|
// allocation happen, both through the EXPLICIT `alloc` parameter, never
|
||||||
|
// the implicit context allocator:
|
||||||
|
// 1. Composite backing stores — `Array.items` / `Object.items` grow via
|
||||||
|
// `arr.add(.., alloc)` / `obj.put(.., alloc)` (genuinely unbounded
|
||||||
|
// children; mirrors `List`).
|
||||||
|
// 2. DECODED strings — a string containing escapes must be un-escaped
|
||||||
|
// into fresh storage; that buffer is `alloc`-ed (bounded by the raw
|
||||||
|
// span, since every escape shrinks). A string with NO escapes is a
|
||||||
|
// zero-copy VIEW into `src`; scalars carry no heap.
|
||||||
|
//
|
||||||
|
// OWNERSHIP / LIFETIME: un-escaped string values are SLICES into `src` —
|
||||||
|
// they are valid only while `src` lives. Everything else (nodes, decoded
|
||||||
|
// strings) is owned by `alloc`; free it all by dropping that allocator
|
||||||
|
// (e.g. an Arena `deinit`). A typical caller parses under an Arena and
|
||||||
|
// keeps `src` alive for as long as the tree is used.
|
||||||
|
//
|
||||||
|
// gpa := GPA.init();
|
||||||
|
// arena := Arena.init(xx gpa, 4096);
|
||||||
|
// defer arena.deinit();
|
||||||
|
// root := try parse(src, xx arena); // composites + decoded strings in arena
|
||||||
|
|
||||||
|
// The reader's failure contract. Meaningful variants so a caller can tell
// a truncated document from a bad escape from trailing junk.
//   UnexpectedToken — a byte that cannot appear at this point (unknown value
//                     start, literal mismatch, missing ':' / ',' / closing
//                     bracket, non-string object key).
//   UnexpectedEnd   — the input ran out where more bytes were required.
//   BadEscape       — an invalid `\` escape inside a string (unknown escape
//                     letter, malformed `\uXXXX`, unpaired surrogate).
//   BadNumber       — a number the s64-only reader rejects (fraction or
//                     exponent, leading zero, out-of-range magnitude, '-'
//                     with no digit).
//   TrailingGarbage — non-whitespace after the complete top-level value.
JsonParseError :: error { UnexpectedToken, UnexpectedEnd, BadEscape, BadNumber, TrailingGarbage }
|
||||||
|
|
||||||
|
// Numeric value (0..15) of one ASCII hex digit, accepting either letter
// case. Any other byte raises `BadEscape` — a non-hex byte in a `\uXXXX`
// escape is a malformed escape.
hex_value :: (c: u8) -> (s64, !JsonParseError) {
    from_zero := (cast(s64) c) - 48;            // distance from '0'
    if from_zero >= 0 and from_zero <= 9 { return from_zero; }

    from_upper := (cast(s64) c) - 65;           // distance from 'A'
    if from_upper >= 0 and from_upper <= 5 { return from_upper + 10; }

    from_lower := (cast(s64) c) - 97;           // distance from 'a'
    if from_lower >= 0 and from_lower <= 5 { return from_lower + 10; }

    raise error.BadEscape;
}
|
||||||
|
|
||||||
|
// Write the UTF-8 encoding of code point `cp` to `out` and return how many
// bytes were written (1..4). The caller has already validated `cp`
// (0..0x10FFFF, not a surrogate). `out` is not bounds-checked: the decode
// buffer is sized to the raw escaped span, which is always at least as
// long as the encoded result.
encode_utf8 :: (cp: s64, out: [*]u8) -> s64 {
    // ASCII is stored as-is.
    if cp < 0x80 {
        out[0] = xx cp;
        return 1;
    }

    // Pick the sequence length and the matching lead-byte marker.
    count := 4;
    lead := 0xF0;
    if cp < 0x800 { count = 2; lead = 0xC0; }
    else if cp < 0x10000 { count = 3; lead = 0xE0; }

    // Fill continuation bytes from last to first, six payload bits each;
    // whatever remains in `rest` belongs to the lead byte.
    rest := cp;
    k := count - 1;
    while k > 0 {
        out[k] = xx (0x80 | (rest & 0x3F));
        rest = rest >> 6;
        k = k - 1;
    }
    out[0] = xx (lead | rest);
    return count;
}
|
||||||
|
|
||||||
|
// The cursor over the input. `src` is borrowed (never written); `pos` is
|
||||||
|
// the running offset and doubles as the failure position; `alloc` is the
|
||||||
|
// EXPLICIT allocator for composites + decoded strings.
|
||||||
|
Parser :: struct {
|
||||||
|
src: string;
|
||||||
|
pos: s64 = 0;
|
||||||
|
alloc: Allocator;
|
||||||
|
|
||||||
|
// Move `pos` forward over any run of JSON whitespace (space 32, tab 9,
// LF 10, CR 13). Stops at the first other byte or at end of input.
skip_ws :: (self: *Parser) {
    while self.pos < self.src.len {
        b := self.src[self.pos];
        is_ws := b == 32 or b == 9 or b == 10 or b == 13;
        if !is_ws { break; }
        self.pos += 1;
    }
}
|
||||||
|
|
||||||
|
// Consume the exact literal `lit` (`true` / `false` / `null`) at `pos`.
//
// Bytes are compared one at a time so the error reflects what actually
// went wrong:
//   - `UnexpectedToken` — a byte that is present differs from the literal
//     (e.g. `[t]` or `tx`); the document is wrong, not truncated.
//   - `UnexpectedEnd`   — the input ran out while every byte so far still
//     matched (e.g. `tru`); the document is truncated.
// On success `pos` advances past the literal; on failure `pos` is left
// unchanged.
expect_lit :: (self: *Parser, lit: string) -> !JsonParseError {
    i := 0;
    while i < lit.len {
        // Check for end of input per byte, not up front: a short remainder
        // that already mismatches is a bad token, not a truncation.
        if self.pos + i >= self.src.len { raise error.UnexpectedEnd; }
        if self.src[self.pos + i] != lit[i] { raise error.UnexpectedToken; }
        i += 1;
    }
    self.pos += lit.len;
    return;
}
|
||||||
|
|
||||||
|
// Read the four hex digits starting at offset `i` and return their 16-bit
// value. `end` is the exclusive limit of the string body: if fewer than
// four bytes remain before it, or any byte is not a hex digit, the escape
// is malformed and `BadEscape` is raised.
read_hex4 :: (self: *Parser, i: s64, end: s64) -> (s64, !JsonParseError) {
    if end - i < 4 { raise error.BadEscape; }
    acc := 0;
    at := i;
    stop := i + 4;
    while at < stop {
        nibble := try hex_value(self.src[at]);
        acc = (acc << 4) + nibble;      // shift in the next four bits
        at += 1;
    }
    return acc;
}
|
||||||
|
|
||||||
|
// Un-escape the string body `src[start, end)` into `out` and return the
// number of bytes written. Every malformed escape raises `BadEscape`.
//
// Relies on the scan in parse_string having already rejected a dangling
// backslash, so reading the byte after a `\` never leaves the body.
// `out` is not bounds-checked; the caller sizes it to the raw span.
decode_into :: (self: *Parser, start: s64, end: s64, out: [*]u8) -> (s64, !JsonParseError) {
    rd := start;    // read cursor into src
    wr := 0;        // write cursor into out
    while rd < end {
        b := self.src[rd];
        if b != 92 {
            // Ordinary byte: copy through unchanged.
            out[wr] = b;
            wr += 1;
            rd += 1;
        } else {
            esc := self.src[rd + 1];    // the byte following the backslash
            if esc == 117 {             // \uXXXX
                first := rd + 2;
                unit := try self.read_hex4(first, end);
                if unit >= 0xD800 and unit <= 0xDBFF {
                    // High surrogate: a `\uYYYY` low surrogate must follow
                    // immediately; the pair combines into one code point.
                    second := first + 4;
                    if second + 2 > end { raise error.BadEscape; }
                    if self.src[second] != 92 or self.src[second + 1] != 117 { raise error.BadEscape; }
                    low := try self.read_hex4(second + 2, end);
                    if low < 0xDC00 or low > 0xDFFF { raise error.BadEscape; }
                    cp := 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                    wr += encode_utf8(cp, @out[wr]);
                    rd = second + 6;
                } else {
                    // A low surrogate with no preceding high one is invalid.
                    if unit >= 0xDC00 and unit <= 0xDFFF { raise error.BadEscape; }
                    wr += encode_utf8(unit, @out[wr]);
                    rd = first + 4;
                }
            } else {
                // Two-byte escape: map the escape letter to its byte.
                decoded : u8 = 0;
                if esc == 34 { decoded = 34; }          // \"
                else if esc == 92 { decoded = 92; }     // \\
                else if esc == 47 { decoded = 47; }     // \/
                else if esc == 98 { decoded = 8; }      // \b
                else if esc == 102 { decoded = 12; }    // \f
                else if esc == 110 { decoded = 10; }    // \n
                else if esc == 114 { decoded = 13; }    // \r
                else if esc == 116 { decoded = 9; }     // \t
                else { raise error.BadEscape; }
                out[wr] = decoded;
                wr += 1;
                rd += 2;
            }
        }
    }
    return wr;
}
|
||||||
|
|
||||||
|
// Parse the string whose opening quote is at `pos`.
//
// Pass 1 scans for the closing quote, noting whether any backslash occurs;
// a backslash always protects the byte after it, so an escaped quote does
// not end the string. Input that ends first raises `UnexpectedEnd`.
//
// With no escapes the result is a zero-copy VIEW into `src`. Otherwise the
// body is decoded into a buffer obtained from `alloc`, sized to the raw
// span (the decoded form is never longer). On success `pos` ends just past
// the closing quote.
//
// NOTE(review): the result of `alloc.alloc` is used without a check —
// confirm the allocator cannot return a null pointer here.
parse_string :: (self: *Parser) -> (string, !JsonParseError) {
    self.pos += 1;                      // step over the opening quote
    body := self.pos;
    limit := self.src.len;
    escaped := false;
    at := body;
    while at < limit {
        b := self.src[at];
        if b == 34 { break; }           // closing quote
        if b == 92 {
            // Backslash: the next byte belongs to the escape and must exist.
            escaped = true;
            if at + 1 >= limit { raise error.UnexpectedEnd; }
            at += 2;
        } else {
            at += 1;
        }
    }
    if at >= limit { raise error.UnexpectedEnd; }   // never saw the closing quote

    span := at - body;
    if escaped {
        // The decoded form is never longer than the raw span.
        buf : [*]u8 = xx self.alloc.alloc(span);
        n := try self.decode_into(body, at, buf);
        self.pos = at + 1;
        return string.{ ptr = buf, len = n };
    }
    self.pos = at + 1;
    return string.{ ptr = @self.src[body], len = span };
}
|
||||||
|
|
||||||
|
// Parse an s64 integer at `pos`: an optional '-' followed by digits.
// Raises `BadNumber` for a '-' with no digit, a leading zero, a
// fraction/exponent tail ('.', 'e', 'E'), or any magnitude outside s64.
//
// The running value is kept NEGATIVE (each digit is subtracted), because
// the negative range reaches one further than the positive one — this is
// what lets s64 MIN parse exactly. The sign is applied at the end.
parse_number :: (self: *Parser) -> (s64, !JsonParseError) {
    // s64 MIN, spelled arithmetically since its magnitude is not a valid
    // positive s64 literal; `floor10` is MIN / 10 truncated toward zero
    // (remainder -8), the threshold for the overflow test in the loop.
    lowest := 0 - 9223372036854775807 - 1;
    floor10 := 0 - 922337203685477580;

    negative := self.src[self.pos] == 45;                   // '-'
    if negative { self.pos += 1; }
    if self.pos >= self.src.len { raise error.BadNumber; }  // '-' with no digit

    first := self.pos;
    lead := self.src[first];
    if lead < 48 or lead > 57 { raise error.BadNumber; }

    acc : s64 = 0;
    while self.pos < self.src.len {
        b := self.src[self.pos];
        if b < 48 or b > 57 { break; }
        digit := (cast(s64) b) - 48;
        // Refuse any step where acc * 10 - digit would fall below MIN.
        if acc < floor10 or (acc == floor10 and digit > 8) { raise error.BadNumber; }
        acc = acc * 10 - digit;
        self.pos += 1;
    }

    // More than one digit starting with '0' is a leading zero.
    if lead == 48 and (self.pos - first) > 1 { raise error.BadNumber; }

    if self.pos < self.src.len {
        next := self.src[self.pos];
        if next == 46 or next == 101 or next == 69 { raise error.BadNumber; }   // '.' / 'e' / 'E' — ints only
    }

    if negative { return acc; }
    if acc == lowest { raise error.BadNumber; }     // |MIN| has no positive s64 form
    return 0 - acc;
}
|
||||||
|
|
||||||
|
// Parse the array whose '[' is at `pos`, appending each element to an
// `Array` that grows through `alloc`. After every element the next
// non-whitespace byte must be ',' (another element follows) or ']' (done):
// anything else raises `UnexpectedToken`, and running out of input raises
// `UnexpectedEnd`.
parse_array :: (self: *Parser) -> (Value, !JsonParseError) {
    self.pos += 1;                              // step over '['
    elems : Array = .{};
    self.skip_ws();
    if self.pos < self.src.len and self.src[self.pos] == 93 {   // immediate ']': empty array
        self.pos += 1;
        return Value.array(elems);
    }

    closed := false;
    while !closed {
        item := try self.parse_value();
        elems.add(item, self.alloc);
        self.skip_ws();
        if self.pos >= self.src.len { raise error.UnexpectedEnd; }
        sep := self.src[self.pos];
        if sep != 44 and sep != 93 { raise error.UnexpectedToken; }
        self.pos += 1;                          // consume ',' or ']'
        closed = (sep == 93);
    }
    return Value.array(elems);
}
|
||||||
|
|
||||||
|
// Parse the object whose '{' is at `pos`. Each member is a string key, a
// ':' and a value; members are stored with `put` in the order they appear
// (this function does no duplicate-key check of its own). After every
// member the next non-whitespace byte must be ',' or '}'. A non-string
// key, a missing ':' or any other separator raises `UnexpectedToken`;
// running out of input raises `UnexpectedEnd`.
parse_object :: (self: *Parser) -> (Value, !JsonParseError) {
    self.pos += 1;                              // step over '{'
    members : Object = .{};
    self.skip_ws();
    if self.pos < self.src.len and self.src[self.pos] == 125 {  // immediate '}': empty object
        self.pos += 1;
        return Value.object(members);
    }

    closed := false;
    while !closed {
        // Key: must be a string.
        self.skip_ws();
        if self.pos >= self.src.len { raise error.UnexpectedEnd; }
        if self.src[self.pos] != 34 { raise error.UnexpectedToken; }
        name := try self.parse_string();

        // Separator ':'.
        self.skip_ws();
        if self.pos >= self.src.len { raise error.UnexpectedEnd; }
        if self.src[self.pos] != 58 { raise error.UnexpectedToken; }
        self.pos += 1;

        // Value, stored under the key.
        member := try self.parse_value();
        members.put(name, member, self.alloc);

        // ',' continues with another member, '}' ends the object.
        self.skip_ws();
        if self.pos >= self.src.len { raise error.UnexpectedEnd; }
        sep := self.src[self.pos];
        if sep != 44 and sep != 125 { raise error.UnexpectedToken; }
        self.pos += 1;
        closed = (sep == 125);
    }
    return Value.object(members);
}
|
||||||
|
|
||||||
|
// Parse one JSON value at `pos`, skipping leading whitespace first. The
// first byte selects the kind: '{' object, '[' array, '"' string,
// 't' / 'f' / 'n' the literals true / false / null, '-' or a digit a
// number. Any other byte raises `UnexpectedToken`; no byte at all raises
// `UnexpectedEnd`.
//
// NOTE(review): arrays and objects recurse back into this function with no
// depth limit — deeply nested input may exhaust the stack.
parse_value :: (self: *Parser) -> (Value, !JsonParseError) {
    self.skip_ws();
    if self.pos >= self.src.len { raise error.UnexpectedEnd; }
    lead := self.src[self.pos];

    if lead == 123 {                            // '{'
        return try self.parse_object();
    } else if lead == 91 {                      // '['
        return try self.parse_array();
    } else if lead == 34 {                      // '"'
        text := try self.parse_string();
        return Value.str(text);
    } else if lead == 116 {                     // 't'
        try self.expect_lit("true");
        return Value.bool_(true);
    } else if lead == 102 {                     // 'f'
        try self.expect_lit("false");
        return Value.bool_(false);
    } else if lead == 110 {                     // 'n'
        try self.expect_lit("null");
        nothing : Value = .null_;
        return nothing;
    } else if lead == 45 or (lead >= 48 and lead <= 57) {   // '-' / digit
        num := try self.parse_number();
        return Value.int_(num);
    }
    raise error.UnexpectedToken;
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entry point of the reader: parse `src` as exactly one JSON document and
// return its value. `alloc` supplies storage for composite nodes and for
// decoded (escaped) strings; strings without escapes are VIEWS into `src`
// and stay valid only while `src` does. Only whitespace may follow the
// value — anything else raises `error.TrailingGarbage`.
parse :: (src: string, alloc: Allocator) -> (Value, !JsonParseError) {
    reader := Parser.{ src = src, alloc = alloc };
    root := try reader.parse_value();
    reader.skip_ws();
    if reader.pos == reader.src.len { return root; }
    raise error.TrailingGarbage;
}
|
||||||
|
|||||||
Reference in New Issue
Block a user