From 88be54177825e41682bad113bbfbd51cb236e5bc Mon Sep 17 00:00:00 2001 From: agra Date: Thu, 4 Jun 2026 01:41:33 +0300 Subject: [PATCH] =?UTF-8?q?F2.2:=20std/json=20reader=20=E2=80=94=20explici?= =?UTF-8?q?t-alloc=20parse=20with=20error=20surfacing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the JSON reader (parser) to library/modules/std/json.sx, the inverse of the F2.1 writer over the same value model: insertion-ordered objects, arrays, strings (full unescaping incl. \uXXXX + surrogate pairs), s64 integers, bool, null. Heap discipline (binding): exactly two allocation kinds, both through the EXPLICIT `alloc` parameter, never the implicit context allocator — composite backing stores (Array/Object.items via add/put) and decoded escaped-string buffers (bounded by the raw span). Un-escaped string values are zero-copy VIEWS into the input buffer (valid only while it lives); scalars carry no heap. Failure surfacing (hard contract): malformed input raises a meaningful JsonParseError variant (UnexpectedToken / UnexpectedEnd / BadEscape / BadNumber / TrailingGarbage) on the error channel, never a bogus value. Trailing non-whitespace is TrailingGarbage; fractions/exponents, out-of-s64 magnitudes, and leading zeros are BadNumber. Number accumulation runs in negative space so s64 MIN parses exactly. examples/0714-modules-json-reader.sx asserts the parsed structure (insertion order, every kind), proves the view-vs-decoded heap split by pointer containment, round-trips back through the writer byte-for-byte, decodes a surrogate-pair into 4 UTF-8 bytes, and checks every malformed variant. Filed issues/0078: a string `==` (or any sub-CFG operand) used in a short-circuit `and`/`or` emits invalid LLVM IR (stale PHI predecessor), hit while writing the example's assertions and worked around there by not combining comparisons with `and`/`or`. src/ untouched. 
--- examples/0714-modules-json-reader.sx | 137 ++++++++ .../expected/0714-modules-json-reader.exit | 1 + .../expected/0714-modules-json-reader.stderr | 0 .../expected/0714-modules-json-reader.stdout | 37 ++ ...perand-of-short-circuit-and-invalid-phi.md | 113 ++++++ library/modules/std/json.sx | 331 +++++++++++++++++- 6 files changed, 615 insertions(+), 4 deletions(-) create mode 100644 examples/0714-modules-json-reader.sx create mode 100644 examples/expected/0714-modules-json-reader.exit create mode 100644 examples/expected/0714-modules-json-reader.stderr create mode 100644 examples/expected/0714-modules-json-reader.stdout create mode 100644 issues/0078-string-eq-operand-of-short-circuit-and-invalid-phi.md diff --git a/examples/0714-modules-json-reader.sx b/examples/0714-modules-json-reader.sx new file mode 100644 index 0000000..2197638 --- /dev/null +++ b/examples/0714-modules-json-reader.sx @@ -0,0 +1,137 @@ +// JSON reader (parser) from `modules/std/json.sx` — the inverse of the +// F2.1 writer. +// +// Parses a representative document (nested object + array + a +// string-with-escapes + ints incl. negatives + bool + null) into the +// shared value model, then proves: +// +// 1. STRUCTURE — the parsed tree has the expected keys (in INSERTION +// order), values, and nesting. +// 2. HEAP DISCIPLINE — an un-escaped string value is a zero-copy VIEW +// into the input buffer (its bytes lie inside `src`), while an +// escaped string is DECODED into a fresh `alloc`-ed buffer (its +// bytes lie OUTSIDE `src`). Composite nodes + the decoded string are +// the only allocations, all through the explicit Arena. +// 3. ROUND-TRIP — feeding the parsed tree back to the writer reproduces +// the canonical input byte-for-byte. +// 4. UNICODE — `\uXXXX` (BMP + 2-byte) and a surrogate pair decode to +// the right UTF-8 bytes. +// 5. FAILURE SURFACING — every malformed input raises the right +// `JsonParseError` variant on the error channel, never a bogus value. 
+
+#import "modules/std.sx";
+#import "modules/std/json.sx";
+
+// Canonical document: no insignificant whitespace, escapes in the writer's
+// own form — so re-serializing the parse must reproduce it exactly.
+// Six members, in this order: name, esc, xs, yes, nil, sub — the index-based
+// assertions in `main` (items[0] .. items[5]) depend on that order.
+DOC :: "{\"name\":\"plain\",\"esc\":\"a\\nb\",\"xs\":[10,-20],\"yes\":true,\"nil\":null,\"sub\":{\"k\":\"v\"}}";
+
+// Print one result line: `<label>: ok` when `ok` holds, `<label>: FAIL`
+// otherwise. These lines are what the expected-stdout fixture lists, one per
+// call, in call order.
+report :: (label: string, ok: bool) {
+    if ok { print("{}: ok\n", label); } else { print("{}: FAIL\n", label); }
+}
+
+// Half-open containment [lo, hi). Written with early returns (no `and`) so
+// the assertions below never combine comparisons with short-circuit
+// `and`/`or` — see issues/0078.
+in_range :: (x: s64, lo: s64, hi: s64) -> bool {
+    if x < lo { return false; }
+    if x >= hi { return false; }
+    return true;
+}
+
+// True when `parse(src)` raised `want` — destructure captures the error
+// tag without `try`, so a malformed input never aborts the example.
+// The parsed value (first slot) is discarded; only the error tag is compared.
+raises :: (src: string, want: JsonParseError, alloc: Allocator) -> bool {
+    _, e := parse(src, alloc);
+    e == want
+}
+
+// Runs every check in a fixed order and prints one `report` line per check,
+// then `=== DONE ===`. All parsing goes through one Arena, released by the
+// deferred `deinit`.
+main :: () -> ! {
+    gpa := GPA.init();
+    arena := Arena.init(xx gpa, 8192);
+    defer arena.deinit();
+
+    // ── 1. Structure ─────────────────────────────────────────────────
+    src := DOC;
+    root := try parse(src, xx arena);
+
+    is_object := if root == { case .object: true; else: false; };
+    report("root-is-object", is_object);
+
+    o := root.object;
+    report("member-count", o.len == 6);
+    report("key-order-0", o.items[0].key == "name");
+    report("string-plain", o.items[0].val.str == "plain");
+    report("string-escaped", o.items[1].val.str == "a\nb"); // \n decoded to 0x0A
+
+    xs := o.items[2].val.array;
+    report("array-len", xs.len == 2);
+    report("array-pos", xs.items[0].int_ == 10);
+    report("array-neg", xs.items[1].int_ == 0 - 20);
+
+    report("bool-value", o.items[3].val.bool_ == true);
+
+    is_null := if o.items[4].val == { case .null_: true; else: false; };
+    report("null-value", is_null);
+
+    // Two separate reports (not `key=="k" and val=="v"`): a string `==`
+    // as an operand of short-circuit `and`/`or` miscompiles — see
+    // issues/0078. Every assertion here is therefore a single comparison.
+    sub := o.items[5].val.object;
+    report("nested-key", sub.items[0].key == "k");
+    report("nested-val", sub.items[0].val.str == "v");
+
+    // ── 2. Heap discipline: view vs decoded ──────────────────────────
+    // Compare raw addresses: [base, stop) is the byte range occupied by `src`.
+    base : s64 = xx src.ptr;
+    stop := base + src.len;
+    p_plain : s64 = xx o.items[0].val.str.ptr; // "plain": no escape -> VIEW into src
+    p_esc : s64 = xx o.items[1].val.str.ptr; // "a\nb": escaped -> DECODED into arena
+    report("plain-is-view", in_range(p_plain, base, stop));
+    report("escaped-allocated", !in_range(p_esc, base, stop));
+
+    // ── 3. Round-trip back through the writer ────────────────────────
+    // 256 bytes is comfortably larger than DOC, so the writer cannot run out
+    // of room here; `n` is the byte count it reports back.
+    buf : [256]u8 = ---;
+    n := try write_to_buffer(root, string.{ ptr = @buf[0], len = 256 });
+    rt := string.{ ptr = @buf[0], len = n };
+    report("round-trip", rt == src);
+
+    // ── 4. Leading/trailing/inner whitespace is insignificant ────────
+    // Each comparison is its own report (no `and`-combining — issues/0078).
+    wsv := try parse(" [ 1 , 2 , 3 ] ", xx arena);
+    wa := wsv.array;
+    report("ws-count", wa.len == 3);
+    report("ws-first", wa.items[0].int_ == 1);
+    report("ws-last", wa.items[2].int_ == 3);
+
+    // Empty container literals (the manifest/db.json use these).
+    ea := try parse("[]", xx arena);
+    report("empty-array", ea.array.len == 0);
+    eo := try parse("{}", xx arena);
+    report("empty-object", eo.object.len == 0);
+
+    // ── 5. Unicode: \uXXXX (1- and 2-byte) + surrogate pair (4-byte) ──
+    // JSON "Aé😀" -> 'A', 'é' (C3 A9), '😀' (F0 9F 98 80). One byte per report.
+    // Expected decoded length: 1 + 2 + 4 = 7 bytes.
+    univ := try parse("\"\\u0041\\u00e9\\uD83D\\uDE00\"", xx arena);
+    u := univ.str;
+    report("uni-len", u.len == 7);
+    report("uni-A", u[0] == 0x41); // U+0041 -> 1 byte
+    report("uni-e1", u[1] == 0xC3); // U+00E9 -> 2 bytes
+    report("uni-e2", u[2] == 0xA9);
+    report("uni-s0", u[3] == 0xF0); // U+1F600 (surrogate pair) -> 4 bytes
+    report("uni-s1", u[4] == 0x9F);
+    report("uni-s2", u[5] == 0x98);
+    report("uni-s3", u[6] == 0x80);
+
+    // ── 6. Malformed inputs each surface the right error variant ─────
+    // One input per variant, plus extra BadNumber / UnexpectedEnd shapes.
+    // 9223372036854775808 is s64 MAX + 1, the smallest positive overflow.
+    report("err-truncated", raises("{\"a\":", error.UnexpectedEnd, xx arena));
+    report("err-bad-escape", raises("\"a\\xb\"", error.BadEscape, xx arena));
+    report("err-trailing-junk", raises("[1,2] x", error.TrailingGarbage, xx arena));
+    report("err-bad-token", raises("xyz", error.UnexpectedToken, xx arena));
+    report("err-fraction", raises("1.5", error.BadNumber, xx arena));
+    report("err-leading-zero", raises("01", error.BadNumber, xx arena));
+    report("err-overflow", raises("9223372036854775808", error.BadNumber, xx arena));
+    report("err-unterminated", raises("\"abc", error.UnexpectedEnd, xx arena));
+
+    print("=== DONE ===\n");
+    return;
+}
diff --git a/examples/expected/0714-modules-json-reader.exit b/examples/expected/0714-modules-json-reader.exit
new file mode 100644
index 0000000..573541a
--- /dev/null
+++ b/examples/expected/0714-modules-json-reader.exit
@@ -0,0 +1 @@
+0
diff --git a/examples/expected/0714-modules-json-reader.stderr b/examples/expected/0714-modules-json-reader.stderr
new file mode 100644
index 0000000..e69de29
diff --git a/examples/expected/0714-modules-json-reader.stdout b/examples/expected/0714-modules-json-reader.stdout
new file mode 100644
index 0000000..796b0a9
--- /dev/null
+++ b/examples/expected/0714-modules-json-reader.stdout
@@ -0,0 +1,37 @@
+root-is-object: ok
+member-count: ok
+key-order-0: ok
+string-plain: ok
+string-escaped: ok
+array-len: ok
+array-pos: ok
+array-neg: ok
+bool-value: ok
+null-value: ok
+nested-key: ok
+nested-val: ok
+plain-is-view: ok
+escaped-allocated: ok
+round-trip: ok
+ws-count: ok
+ws-first: ok
+ws-last: ok
+empty-array: ok
+empty-object: ok
+uni-len: ok
+uni-A: ok
+uni-e1: ok
+uni-e2: ok
+uni-s0: ok
+uni-s1: ok
+uni-s2: ok
+uni-s3: ok
+err-truncated: ok
+err-bad-escape: ok
+err-trailing-junk: ok
+err-bad-token: ok
+err-fraction: ok
+err-leading-zero: ok
+err-overflow: ok
+err-unterminated: ok
+=== DONE ===
diff --git 
a/issues/0078-string-eq-operand-of-short-circuit-and-invalid-phi.md b/issues/0078-string-eq-operand-of-short-circuit-and-invalid-phi.md new file mode 100644 index 0000000..e5139cf --- /dev/null +++ b/issues/0078-string-eq-operand-of-short-circuit-and-invalid-phi.md @@ -0,0 +1,113 @@ +# Symptom + +A string equality (`a == "x"`) used as an operand of a short-circuit +`and` / `or` emits LLVM IR that fails verification — the JIT (`sx run`) +and AOT paths both abort before running: + +``` +LLVM verification failed: PHI node entries do not match predecessors! + %bp = phi i1 [ false, %entry ], [ %str.eq10, %and.rhs.0 ] +label %entry +label %str.merge +Instruction does not dominate all uses! + %str.eq10 = phi i1 [ false, %and.rhs.0 ], [ %str.ceq9, %str.memcmp6 ] + %bp = phi i1 [ false, %entry ], [ %str.eq10, %and.rhs.0 ] +``` + +Integer/`error`-tag equality in the same position is fine — only the +string `==` operand miscompiles, because string `==` lowers to its own +multi-block memcmp with an internal PHI (`str.eq` ← {`str.memcmp`, +short-circuit false}). When that result is then consumed by the `and`/`or` +short-circuit merge, the predecessor set the outer PHI records does not +match the actual CFG: the string-compare's merge block becomes a +predecessor of the `and` merge, but the outer PHI still lists the original +`entry`/`and.rhs` edges. The inner `str.eq` PHI also ends up referenced +from a block it does not dominate. + +# Reproduction + +```sx +#import "modules/std.sx"; +main :: () { + a := "k"; + b := "v"; + r := a == "k" and b == "v"; // string == as an `and` operand + print("{}\n", r); +} +``` + +``` +$ ./zig-out/bin/sx run repro.sx +LLVM verification failed: PHI node entries do not match predecessors! +... +``` + +`a == "k" or b == "v"` reproduces it identically (`or.rhs` in place of +`and.rhs`). A single `a == "k"` (no `and`/`or`) compiles and runs fine, as +does `x == 1 and y == 2` (integer operands). 
So the trigger is specifically +a **string `==`/`!=` as an operand of a short-circuit `and`/`or`** — the +operand emits its own `str.memcmp`/`str.merge` sub-CFG, and the +short-circuit PHI then records a stale predecessor block. + +A related `match.merge`-predecessor variant of the same PHI mismatch also +appears in a LARGER function that mixes several enum-payload accesses +(`v.str`/`v.int_`) and `match` expressions with multiple `and`/`or` +operations (it surfaced while writing +`examples/0714-modules-json-reader.sx`). It did NOT reduce to a small +standalone repro — each construct compiles fine in isolation, and a single +payload-access operand (`true and e.a == 1`) or a preceding `match` +expression followed by an `and` of locals both compile — which points at +cumulative basic-block bookkeeping in the `and`/`or` lowering rather than a +single local pattern. The string-`==` case above is the reliable minimal +reproduction; the broader fix should address PHI predecessor tracking for +any `and`/`or` operand that emits intermediate basic blocks. + +# Expected + +`r` should be `true` (both compares hold) and the program print `true`. +Generally: a `string ==`/`!=` result must be usable as an operand of +`and`/`or` exactly like any other `bool`. + +# Workaround (until fixed) + +Don't combine string equality with `and`/`or` in one expression; split +into separate statements / separate boolean locals: + +```sx +ok_k := a == "k"; +ok_v := b == "v"; +r := ok_k and ok_v; // each string-eq materialized before the short-circuit +``` + +# Background / where to look + +The string `==` lowering (search `str.eq` / `str.memcmp` / `str.merge` +block names in `src/ir/lower.zig`) produces a value via a PHI that joins +the memcmp-equal block and the early-out (length-mismatch / short-circuit) +block. The boolean `and`/`or` lowering builds its own `and.rhs` / +`and.merge` (resp. `or.*`) blocks and a merge PHI. 
When the LHS (or RHS) +of the `and`/`or` is itself a string compare, the outer short-circuit +lowering must take the string-compare's *actual current block* (its merge +block) as the incoming predecessor for the outer PHI — not the block that +was current before the string compare emitted its sub-CFG. The mismatch +above is the classic "PHI incoming-block is stale after the operand +emitted new basic blocks" bug: the fix is to re-read the builder's current +insertion block when wiring the `and`/`or` PHI incoming edges, rather than +caching it before lowering the operand. This mirrors the shape of the +match-arm PHI fix in issue 0066. + +Discovered while writing the std.json reader regression example +(`examples/0714-modules-json-reader.sx`, flow step F2.2): an assertion +`key == "k" and val.str == "v"` triggered it. The reader library code +itself does not use this pattern; the example was rewritten to assert the +two string equalities separately. + +# Verification (once fixed) + +```sh +./zig-out/bin/sx run repro.sx # prints: true +``` + +Add a regression example (next free `examples/NNNN-*.sx` slot) that uses a +string `==` on both sides of an `and` and on both sides of an `or`, and +the full suite + `zig build test` must stay green. diff --git a/library/modules/std/json.sx b/library/modules/std/json.sx index f43c27c..92667f1 100644 --- a/library/modules/std/json.sx +++ b/library/modules/std/json.sx @@ -1,12 +1,13 @@ // ===================================================================== -// json.sx — JSON value model + writer (stable key order), pure sx. +// json.sx — JSON value model + writer + reader (stable key order), pure sx. // -// This module delivers the JSON VALUE MODEL and the WRITER. The reader -// (parser) lands separately; this file never reads JSON text. +// This module delivers the JSON VALUE MODEL, the WRITER, and the READER +// (parser). The model is built once and shared by both directions. 
// // NUMBERS ARE INTEGERS ONLY (s64) for this milestone — there is no // fraction or exponent. A JSON value is one of: null, bool, integer, -// string, array, object. +// string, array, object. The reader REJECTS a fraction or exponent +// (`error.BadNumber`) rather than silently truncating it. // // STABLE KEY ORDER: an object is NOT a hash map. It is an ORDERED list // of (key, value) pairs that preserves INSERTION ORDER. Keys are never @@ -333,3 +334,325 @@ write_to_file :: (v: Value, file: *File, staging: []u8) -> !JsonError { try sink.flush(); return; } + +// ── Reader (parser) ─────────────────────────────────────────────────── +// +// `parse(src, alloc)` turns a JSON document in `src` into the value model +// above. It is the inverse of the writer for the v0 scope: objects (in +// INSERTION ORDER), arrays, strings (with full unescaping incl. \uXXXX +// and surrogate pairs), s64 integers, bool, null. +// +// FAILURE SURFACING (hard contract): every malformed input raises on the +// error channel (`!JsonParseError`) — never a bogus or default value. +// Trailing non-whitespace after a complete value is `TrailingGarbage`. +// `pos` (the parser cursor) marks where the failure was detected. +// +// NOT SUPPORTED (rejected, not silently accepted): a fraction or exponent +// in a number (`1.5`, `1e9`) → `BadNumber`; a number outside s64 → +// `BadNumber`; a leading-zero integer (`01`) → `BadNumber`. UNESCAPED raw +// control bytes (< 0x20) inside a string are passed through verbatim (the +// minimal-reader leniency the manifest / db.json never exercise). +// +// HEAP DISCIPLINE (binding, see heap-discipline.md). Exactly two kinds of +// allocation happen, both through the EXPLICIT `alloc` parameter, never +// the implicit context allocator: +// 1. Composite backing stores — `Array.items` / `Object.items` grow via +// `arr.add(.., alloc)` / `obj.put(.., alloc)` (genuinely unbounded +// children; mirrors `List`). +// 2. 
DECODED strings — a string containing escapes must be un-escaped
+//      into fresh storage; that buffer is `alloc`-ed (bounded by the raw
+//      span, since every escape shrinks). A string with NO escapes is a
+//      zero-copy VIEW into `src`; scalars carry no heap.
+//
+// OWNERSHIP / LIFETIME: un-escaped string values are SLICES into `src` —
+// they are valid only while `src` lives. Everything else (nodes, decoded
+// strings) is owned by `alloc`; free it all by dropping that allocator
+// (e.g. an Arena `deinit`). A typical caller parses under an Arena and
+// keeps `src` alive for as long as the tree is used.
+//
+//   gpa   := GPA.init();
+//   arena := Arena.init(xx gpa, 4096);
+//   defer arena.deinit();
+//   root  := try parse(src, xx arena);   // composites + decoded strings in arena
+
+// Everything the reader can raise. One variant per failure class, so a
+// caller can tell a truncated document (UnexpectedEnd) from a wrong byte
+// (UnexpectedToken), a bad string escape (BadEscape), an unsupported or
+// out-of-range number (BadNumber), and junk after the value (TrailingGarbage).
+JsonParseError :: error { UnexpectedToken, UnexpectedEnd, BadEscape, BadNumber, TrailingGarbage }
+
+// Value (0..15) of one ASCII hex digit, either case. Any other byte raises
+// `BadEscape` — this helper only serves `\uXXXX` decoding.
+hex_value :: (c: u8) -> (s64, !JsonParseError) {
+    n := (cast(s64) c);                             // widen once, compare as s64
+    if n >= 65 and n <= 70 { return n - 55; }       // 'A'..'F' -> 10..15
+    if n >= 97 and n <= 102 { return n - 87; }      // 'a'..'f' -> 10..15
+    if n >= 48 and n <= 57 { return n - 48; }       // '0'..'9' -> 0..9
+    raise error.BadEscape;
+}
+
+// Encode code point `cp` (already validated 0..0x10FFFF, non-surrogate) as
+// UTF-8 into `out`, returning the byte count (1..4). No bounds check: the
+// decode buffer is sized to the raw escaped span, which always dominates.
+encode_utf8 :: (cp: s64, out: [*]u8) -> s64 {
+    // 1 byte: 0xxxxxxx
+    if cp < 0x80 {
+        out[0] = xx cp;
+        return 1;
+    }
+    // 2 bytes: 110xxxxx 10xxxxxx
+    if cp < 0x800 {
+        out[0] = xx (0xC0 | (cp >> 6));
+        out[1] = xx (0x80 | (cp & 0x3F));
+        return 2;
+    }
+    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if cp < 0x10000 {
+        out[0] = xx (0xE0 | (cp >> 12));
+        out[1] = xx (0x80 | ((cp >> 6) & 0x3F));
+        out[2] = xx (0x80 | (cp & 0x3F));
+        return 3;
+    }
+    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    out[0] = xx (0xF0 | (cp >> 18));
+    out[1] = xx (0x80 | ((cp >> 12) & 0x3F));
+    out[2] = xx (0x80 | ((cp >> 6) & 0x3F));
+    out[3] = xx (0x80 | (cp & 0x3F));
+    return 4;
+}
+
+// The cursor over the input. `src` is borrowed (never written); `pos` is
+// the running offset and doubles as the failure position; `alloc` is the
+// EXPLICIT allocator for composites + decoded strings.
+//
+// Every `parse_*` method expects `pos` to sit on the first byte of the
+// construct it parses (`parse_value` dispatches on that byte) and leaves
+// `pos` just past it on success.
+Parser :: struct {
+    src: string;
+    pos: s64 = 0;
+    alloc: Allocator;
+
+    // Advance past JSON whitespace (space / tab / LF / CR). Stops at the
+    // first other byte or at end of input; never fails.
+    skip_ws :: (self: *Parser) {
+        while self.pos < self.src.len {
+            c := self.src[self.pos];
+            if c == 32 or c == 9 or c == 10 or c == 13 { self.pos += 1; }
+            else { break; }
+        }
+    }
+
+    // Consume the exact literal `lit` (`true` / `false` / `null`) at `pos`.
+    // Bytes are compared one at a time so the error names the real problem:
+    // the first byte that differs raises UnexpectedToken (e.g. `tx`), and
+    // UnexpectedEnd is raised only when the input stops while every byte so
+    // far matched (e.g. `tru`). `pos` is advanced only on a full match.
+    expect_lit :: (self: *Parser, lit: string) -> !JsonParseError {
+        i := 0;
+        while i < lit.len {
+            if self.pos + i >= self.src.len { raise error.UnexpectedEnd; }
+            if self.src[self.pos + i] != lit[i] { raise error.UnexpectedToken; }
+            i += 1;
+        }
+        self.pos += lit.len;
+        return;
+    }
+
+    // Read 4 hex digits at `i` (which must lie within [.., end)); returns
+    // the 16-bit value. Fewer than 4 digits before `end` is a BadEscape, as
+    // is any non-hex byte (surfaced by `hex_value`). Does not move `pos`.
+    read_hex4 :: (self: *Parser, i: s64, end: s64) -> (s64, !JsonParseError) {
+        if i + 4 > end { raise error.BadEscape; }
+        v := 0;
+        k := 0;
+        while k < 4 {
+            v = v * 16 + (try hex_value(self.src[i + k]));
+            k += 1;
+        }
+        return v;
+    }
+
+    // Decode the escaped string body in [start, end) into `out`, returning
+    // the decoded byte length. Pass 1 (in parse_string) guarantees there is
+    // no dangling backslash, so the byte after every `\` is in range.
+    // Unknown escapes, short or non-hex `\u` digits, and unpaired surrogates
+    // all raise BadEscape. Does not move `pos`.
+    decode_into :: (self: *Parser, start: s64, end: s64, out: [*]u8) -> (s64, !JsonParseError) {
+        di := 0;        // write offset into `out`
+        i := start;     // read offset into `src`
+        while i < end {
+            c := self.src[i];
+            if c == 92 { // backslash
+                i += 1;
+                e := self.src[i];
+                if e == 34 { out[di] = 34; di += 1; i += 1; } // \"
+                else if e == 92 { out[di] = 92; di += 1; i += 1; } // \\
+                else if e == 47 { out[di] = 47; di += 1; i += 1; } // \/
+                else if e == 98 { out[di] = 8; di += 1; i += 1; } // \b
+                else if e == 102 { out[di] = 12; di += 1; i += 1; } // \f
+                else if e == 110 { out[di] = 10; di += 1; i += 1; } // \n
+                else if e == 114 { out[di] = 13; di += 1; i += 1; } // \r
+                else if e == 116 { out[di] = 9; di += 1; i += 1; } // \t
+                else if e == 117 { // \uXXXX
+                    hpos := i + 1;
+                    u := try self.read_hex4(hpos, end);
+                    if u >= 0xD800 and u <= 0xDBFF {
+                        // high surrogate: require a following \uYYYY low surrogate
+                        lpos := hpos + 4;
+                        if lpos + 2 > end { raise error.BadEscape; }
+                        if self.src[lpos] != 92 or self.src[lpos + 1] != 117 { raise error.BadEscape; }
+                        lo := try self.read_hex4(lpos + 2, end);
+                        if lo < 0xDC00 or lo > 0xDFFF { raise error.BadEscape; }
+                        // Combine the pair into one supplementary code point.
+                        cp := 0x10000 + ((u - 0xD800) << 10) + (lo - 0xDC00);
+                        di += encode_utf8(cp, @out[di]);
+                        i = lpos + 6; // past `\uYYYY`
+                    } else {
+                        if u >= 0xDC00 and u <= 0xDFFF { raise error.BadEscape; } // lone low surrogate
+                        di += encode_utf8(u, @out[di]);
+                        i = hpos + 4; // past the four hex digits
+                    }
+                }
+                else { raise error.BadEscape; }
+            } else {
+                out[di] = c; di += 1; i += 1;
+            }
+        }
+        return di;
+    }
+
+    // Parse a string starting at the opening quote (current `pos`). Returns
+    // a zero-copy VIEW into `src` when the body has no escapes; otherwise
+    // decodes into an `alloc`-ed buffer (bounded by the raw span). `pos`
+    // ends just past the closing quote. Input that ends before the closing
+    // quote raises UnexpectedEnd; escape errors come from `decode_into`.
+    parse_string :: (self: *Parser) -> (string, !JsonParseError) {
+        self.pos += 1; // consume opening quote
+        start := self.pos;
+        has_escape := false;
+        // Pass 1: find the closing quote, noting whether any escape occurs.
+        i := start;
+        while i < self.src.len {
+            c := self.src[i];
+            if c == 34 { break; } // closing quote
+            if c == 92 { // backslash escapes the next byte
+                has_escape = true;
+                i += 1;
+                if i >= self.src.len { raise error.UnexpectedEnd; }
+            }
+            i += 1;
+        }
+        if i >= self.src.len { raise error.UnexpectedEnd; } // unterminated
+        end := i;
+        if !has_escape {
+            self.pos = end + 1;
+            return string.{ ptr = @self.src[start], len = end - start };
+        }
+        // Pass 2: decode into fresh storage.
+        raw_len := end - start; // decoded length <= raw_len (escapes shrink)
+        out : [*]u8 = xx self.alloc.alloc(raw_len);
+        dlen := try self.decode_into(start, end, out);
+        self.pos = end + 1;
+        return string.{ ptr = out, len = dlen };
+    }
+
+    // Parse an s64 integer (optional '-', then digits). Rejects leading
+    // zeros, a fraction/exponent tail, and any value outside s64 — all
+    // `BadNumber`. Accumulates in NEGATIVE space so s64 MIN parses exactly.
+    // Reads `src[pos]` without a bounds check: the caller (`parse_value`)
+    // only dispatches here when `pos` is on a '-' or a digit.
+    parse_number :: (self: *Parser) -> (s64, !JsonParseError) {
+        // s64 bounds, built positionally because |MIN| is not a
+        // representable positive s64 literal. `min_div10` is `MIN / 10`
+        // truncated toward zero (remainder -8) — the digit loop's overflow
+        // threshold. Accumulation runs in NEGATIVE space so MIN is exact.
+        s64_min := 0 - 9223372036854775807 - 1;
+        min_div10 := 0 - 922337203685477580;
+        neg := false;
+        if self.src[self.pos] == 45 { neg = true; self.pos += 1; } // '-'
+        if self.pos >= self.src.len { raise error.BadNumber; } // '-' with no digit
+        dstart := self.pos;
+        c0 := self.src[self.pos];
+        if c0 < 48 or c0 > 57 { raise error.BadNumber; }
+        val : s64 = 0;
+        digits := 0;
+        while self.pos < self.src.len {
+            c := self.src[self.pos];
+            if c < 48 or c > 57 { break; }
+            d := (cast(s64) c) - 48;
+            // Refuse the digit if `val * 10 - d` would fall below s64 MIN.
+            if val < min_div10 { raise error.BadNumber; }
+            if val == min_div10 and d > 8 { raise error.BadNumber; }
+            val = val * 10 - d;
+            digits += 1;
+            self.pos += 1;
+        }
+        if self.src[dstart] == 48 and digits > 1 { raise error.BadNumber; } // no leading zeros
+        if self.pos < self.src.len {
+            nc := self.src[self.pos];
+            if nc == 46 or nc == 101 or nc == 69 { raise error.BadNumber; } // '.' / 'e' / 'E' — ints only
+        }
+        if !neg {
+            if val == s64_min { raise error.BadNumber; } // |MIN| not representable as +s64
+            val = 0 - val;
+        }
+        return val;
+    }
+
+    // Parse an array starting at '['. Builds an `Array` through `alloc`.
+    // A trailing comma (`[1,]`) fails inside `parse_value` with
+    // UnexpectedToken; input ending mid-array raises UnexpectedEnd.
+    parse_array :: (self: *Parser) -> (Value, !JsonParseError) {
+        self.pos += 1; // consume '['
+        arr : Array = .{};
+        self.skip_ws();
+        if self.pos < self.src.len and self.src[self.pos] == 93 { // empty ']'
+            self.pos += 1;
+            return Value.array(arr);
+        }
+        loop := true;
+        while loop {
+            v := try self.parse_value();
+            arr.add(v, self.alloc);
+            self.skip_ws();
+            if self.pos >= self.src.len { raise error.UnexpectedEnd; }
+            c := self.src[self.pos];
+            if c == 44 { self.pos += 1; } // ',' more
+            else if c == 93 { self.pos += 1; loop = false; } // ']' done
+            else { raise error.UnexpectedToken; }
+        }
+        return Value.array(arr);
+    }
+
+    // Parse an object starting at '{'. Keys must be strings; insertion
+    // order is preserved (duplicate keys are kept, never merged).
+    // NOTE(review): the duplicate-key claim relies on `Object.put` appending
+    // rather than replacing — `put` is defined outside this section; confirm.
+    parse_object :: (self: *Parser) -> (Value, !JsonParseError) {
+        self.pos += 1; // consume '{'
+        obj : Object = .{};
+        self.skip_ws();
+        if self.pos < self.src.len and self.src[self.pos] == 125 { // empty '}'
+            self.pos += 1;
+            return Value.object(obj);
+        }
+        loop := true;
+        while loop {
+            self.skip_ws();
+            if self.pos >= self.src.len { raise error.UnexpectedEnd; }
+            if self.src[self.pos] != 34 { raise error.UnexpectedToken; } // key must be a string
+            key := try self.parse_string();
+            self.skip_ws();
+            if self.pos >= self.src.len { raise error.UnexpectedEnd; }
+            if self.src[self.pos] != 58 { raise error.UnexpectedToken; } // ':'
+            self.pos += 1;
+            v := try self.parse_value();
+            obj.put(key, v, self.alloc);
+            self.skip_ws();
+            if self.pos >= self.src.len { raise error.UnexpectedEnd; }
+            c := self.src[self.pos];
+            if c == 44 { self.pos += 1; } // ',' more
+            else if c == 125 { self.pos += 1; loop = false; } // '}' done
+            else { raise error.UnexpectedToken; }
+        }
+        return Value.object(obj);
+    }
+
+    // Parse any single value (after skipping leading whitespace). Dispatches
+    // on the first byte; a byte that starts no JSON value is UnexpectedToken,
+    // and running out of input is UnexpectedEnd.
+    // NOTE(review): parse_value -> parse_array/parse_object -> parse_value
+    // recurses once per nesting level with no depth limit, so a deeply
+    // nested untrusted document can exhaust the stack.
+    parse_value :: (self: *Parser) -> (Value, !JsonParseError) {
+        self.skip_ws();
+        if self.pos >= self.src.len { raise error.UnexpectedEnd; }
+        c := self.src[self.pos];
+        if c == 123 { return try self.parse_object(); } // '{'
+        if c == 91 { return try self.parse_array(); } // '['
+        if c == 34 { s := try self.parse_string(); return Value.str(s); } // '"'
+        if c == 116 { try self.expect_lit("true"); return Value.bool_(true); } // 't'
+        if c == 102 { try self.expect_lit("false"); return Value.bool_(false); } // 'f'
+        if c == 110 { try self.expect_lit("null"); nv : Value = .null_; return nv; } // 'n'
+        if c == 45 or (c >= 48 and c <= 57) { n := try self.parse_number(); return Value.int_(n); } // '-' / digit
+        raise error.UnexpectedToken;
+    }
+}
+
+// Parse a complete JSON document from `src` into the value model, using
+// `alloc` for composite nodes and decoded (escaped) strings. Un-escaped
+// string values are VIEWS into `src` and are valid only while `src` lives.
+// Trailing non-whitespace after the value raises `error.TrailingGarbage`.
+// Any other malformed input raises the `JsonParseError` variant produced by
+// the `Parser` method that detected it.
+parse :: (src: string, alloc: Allocator) -> (Value, !JsonParseError) {
+    p := Parser.{ src = src, alloc = alloc };
+    v := try p.parse_value();
+    p.skip_ws();
+    if p.pos != p.src.len { raise error.TrailingGarbage; }
+    return v;
+}