From 2871342c0a2ff30452ff374020172028504ee15a Mon Sep 17 00:00:00 2001 From: agra Date: Thu, 4 Jun 2026 02:32:32 +0300 Subject: [PATCH] F2.2: reject raw control bytes (U+0000..U+001F) in JSON strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parse_string scanned for `"` and `\` but accepted every other byte, including raw control characters. RFC 8259 §7 requires those bytes to be escaped inside a string; an unescaped one is invalid JSON and must surface a parse error, not be silently accepted. Add `BadControlChar` to JsonParseError and reject any unescaped byte < 0x20 in the string body scan (which gates the decode path too, so escaped forms like \t/\n/\u0009 still decode correctly; 0x20 and 0x7F are not over-rejected). Regression test in examples/0714: raw 0x09/0x0A/0x00 each raise BadControlChar via `?`/`!`; a positive case proves the escaped forms still decode to the right bytes. All prior assertions kept. --- examples/0714-modules-json-reader.sx | 24 +++++++++++++++++++ .../expected/0714-modules-json-reader.stdout | 7 ++++++ library/modules/std/json.sx | 15 ++++++++---- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/examples/0714-modules-json-reader.sx b/examples/0714-modules-json-reader.sx index f7e1b79..4305b44 100644 --- a/examples/0714-modules-json-reader.sx +++ b/examples/0714-modules-json-reader.sx @@ -42,6 +42,15 @@ raises :: (src: string, want: JsonParseError, alloc: Allocator) -> bool { e == want } +// True when parsing `"a<b>b"` (a string holding the RAW control byte `b`) +// raises BadControlChar. Built from a byte buffer because a raw control +// byte can't appear in an sx string literal. +ctrl_raises :: (b: u8, alloc: Allocator) -> bool { + raw : [5]u8 = ---; + raw[0] = 34; raw[1] = 97; raw[2] = b; raw[3] = 98; raw[4] = 34; // "a<b>b" + return raises(string.{ ptr = @raw[0], len = 5 }, error.BadControlChar, alloc); +} + main :: () -> ! 
{ gpa := GPA.init(); arena := Arena.init(xx gpa, 8192); @@ -125,6 +134,21 @@ main :: () -> ! { report("err-overflow", raises("9223372036854775808", error.BadNumber, xx arena)); report("err-unterminated", raises("\"abc", error.UnexpectedEnd, xx arena)); + // ── 7. RFC 8259 §7: unescaped control bytes (U+0000..U+001F) ────── + // A RAW control byte inside a string is invalid JSON -> BadControlChar. + report("err-raw-tab", ctrl_raises(9, xx arena)); // raw 0x09 + report("err-raw-lf", ctrl_raises(10, xx arena)); // raw 0x0A + report("err-raw-nul", ctrl_raises(0, xx arena)); // raw 0x00 + + // POSITIVE: the ESCAPED control forms stay valid and decode to the + // exact bytes. JSON "\t\n\u0009" -> 0x09 0x0A 0x09 (3 bytes). + esc := try parse("\"\\t\\n\\u0009\"", xx arena); + es := esc.str; + report("esc-ctrl-len", es.len == 3); + report("esc-tab", es[0] == 0x09); // \t + report("esc-lf", es[1] == 0x0A); // \n + report("esc-u", es[2] == 0x09); // \u0009 + print("=== DONE ===\n"); return; } diff --git a/examples/expected/0714-modules-json-reader.stdout b/examples/expected/0714-modules-json-reader.stdout index fb03da7..b4848e2 100644 --- a/examples/expected/0714-modules-json-reader.stdout +++ b/examples/expected/0714-modules-json-reader.stdout @@ -33,4 +33,11 @@ err-fraction: ok err-leading-zero: ok err-overflow: ok err-unterminated: ok +err-raw-tab: ok +err-raw-lf: ok +err-raw-nul: ok +esc-ctrl-len: ok +esc-tab: ok +esc-lf: ok +esc-u: ok === DONE === diff --git a/library/modules/std/json.sx b/library/modules/std/json.sx index 92667f1..8b537c7 100644 --- a/library/modules/std/json.sx +++ b/library/modules/std/json.sx @@ -349,9 +349,11 @@ write_to_file :: (v: Value, file: *File, staging: []u8) -> !JsonError { // // NOT SUPPORTED (rejected, not silently accepted): a fraction or exponent // in a number (`1.5`, `1e9`) → `BadNumber`; a number outside s64 → -// `BadNumber`; a leading-zero integer (`01`) → `BadNumber`. 
UNESCAPED raw -// control bytes (< 0x20) inside a string are passed through verbatim (the -// minimal-reader leniency the manifest / db.json never exercise). +// `BadNumber`; a leading-zero integer (`01`) → `BadNumber`. An UNESCAPED +// raw control byte (U+0000..U+001F) inside a string → `BadControlChar` +// (RFC 8259 §7 requires those bytes to be escaped); the escaped forms +// (`\t`, `\n`, `\u0009`, …) stay valid and decode normally. Bytes >= 0x20, +// including 0x7F (DEL) and non-ASCII UTF-8 bytes (>= 0x80), pass through. // // HEAP DISCIPLINE (binding, see heap-discipline.md). Exactly two kinds of // allocation happen, both through the EXPLICIT `alloc` parameter, never @@ -377,7 +379,7 @@ write_to_file :: (v: Value, file: *File, staging: []u8) -> !JsonError { // The reader's failure contract. Meaningful variants so a caller can tell // a truncated document from a bad escape from trailing junk. -JsonParseError :: error { UnexpectedToken, UnexpectedEnd, BadEscape, BadNumber, TrailingGarbage } +JsonParseError :: error { UnexpectedToken, UnexpectedEnd, BadEscape, BadNumber, TrailingGarbage, BadControlChar } // Lowercase/uppercase hex nibble value (0..15) of an ASCII byte; a non-hex // byte in a `\uXXXX` escape is a `BadEscape`. @@ -518,6 +520,11 @@ Parser :: struct { has_escape = true; i += 1; if i >= self.src.len { raise error.UnexpectedEnd; } + } else if c < 32 { + // RFC 8259 §7: a raw control byte (U+0000..U+001F) must be + // escaped inside a string; an unescaped one is invalid JSON. + self.pos = i; + raise error.BadControlChar; } i += 1; }