From 179310d62b94c6fcf91709b8a13cdd58c5709cf9 Mon Sep 17 00:00:00 2001 From: agra Date: Mon, 25 May 2026 15:45:33 +0300 Subject: [PATCH] =?UTF-8?q?mem:=20Phase=201.4a=20=E2=80=94=20fat-pointer?= =?UTF-8?q?=20aggregates=20from=20`#run`=20serialize=20via=20host=20memory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 1.4 serializer left a silent malformed-const case: when the interp evaluated a `#run` returning a string (or anything with a fat pointer inside), the data field came in as a `.int` holding a libc host address. `LLVMConstInt(ptr_type, addr, 1)` happily emitted `i0 0` in the static const, and the runtime segfaulted on the first read. Phase 1.4a closes this for string and slice destinations. The signature of `valueToLLVMConst` now takes the IR `TypeId` (instead of just the LLVM type) and a borrowed `*Interpreter`. A new helper `serializeAggregateValue` splits on the IR type: - `string` / `slice` (fat pointer `{data, len}`): extract `len`, read that many bytes from the data field's address (via `interp.heapSlice` for `heap_ptr`, via a new `readHostBytes` for `byte_ptr` / `.int`, via slice indexing for string literals). Emit the bytes as a private global byte array using the existing `emitConstStringGlobal`. The fat-pointer aggregate's data ptr resolves to the byte array's address. - `struct`: walk the IR field types in lockstep with the value's fields; recurse with each declared field TypeId. This replaces the old LLVM-type-walk via `LLVMStructGetTypeAtIndex` which couldn't tell string-typed fields from generic ptr fields. - `array`: walk with the element TypeId. The remaining `.int → ptr` trap (a host address landing in a bare ptr field outside a fat pointer) now bails loudly with a named diagnostic identifying it as Phase 1.4a heap-walk follow-up territory. No practical trigger in-tree, so deferred. 
`Interpreter.heapSlice` promoted from package-private to `pub` so the serializer can read interp-managed heap data. Regression: `examples/136-comptime-string-global.sx` — `GREETING :: #run build_greeting();` where `build_greeting` returns `concat("hello", " world")`. Runtime prints `greeting = 'hello world'` and `greeting.len = 11`. Pre-1.4a this segfaulted on the first read. 158/158 example tests; chess clean on macOS / iOS sim / Android via `tools/verify-step.sh`. --- current/CHECKPOINT-MEM.md | 65 ++++-- examples/136-comptime-string-global.sx | 26 +++ src/ir/emit_llvm.zig | 195 +++++++++++++----- src/ir/interp.zig | 2 +- .../expected/136-comptime-string-global.exit | 1 + tests/expected/136-comptime-string-global.txt | 2 + 6 files changed, 231 insertions(+), 60 deletions(-) create mode 100644 examples/136-comptime-string-global.sx create mode 100644 tests/expected/136-comptime-string-global.exit create mode 100644 tests/expected/136-comptime-string-global.txt diff --git a/current/CHECKPOINT-MEM.md b/current/CHECKPOINT-MEM.md index 6997ccb..181cf8c 100644 --- a/current/CHECKPOINT-MEM.md +++ b/current/CHECKPOINT-MEM.md @@ -5,6 +5,41 @@ Tracking checkpoint for the mem.sx Zig-aligned implementation ## Last completed step +- **Phase 1.4a — IR `TypeId` threaded through `valueToLLVMConst`; + string/slice fat-pointer aggregates serialize by reading host + memory.** The Phase 1.4 serializer bailed on `heap_ptr` / `byte_ptr` + and silently emitted `i0 0` for the trap-case where a `.int` host + address landed in a ptr-typed slot. Now the call site at + `emit_llvm.zig:676` passes `global.ty` (TypeId) and `&interp_inst` + instead of just the LLVM type. The serializer splits on the IR + type: + + - `string` / `slice` (fat pointer `{data, len}`): extract `len`, + read that many bytes from the data field's address (heap_ptr → + `interp.heapSlice`; byte_ptr/int → raw process memory via a new + `readHostBytes` helper; string literal → direct slice). 
Emit + the bytes as a private global byte array via the existing + `emitConstStringGlobal` and use it as the aggregate's data ptr. + - `struct`: walk the IR field types in lockstep with the value + fields; recurse per field with its declared TypeId. Replaces + the old LLVM-type-walk via `LLVMStructGetTypeAtIndex` which + couldn't tell `string`-typed fields from generic ptr fields. + - `array`: walk elements with the element TypeId. + + The `.int → ptr` slot mismatch (a host address landing in a + non-fat-pointer ptr slot) now bails loudly with a named + diagnostic — that's the genuine heap-walk frontier where future + work would need to capture struct content recursively, not the + silent malformed-const we had before. + + `Interpreter.heapSlice` was promoted from package-private to + `pub` so the serializer can read interp heap. Regression at + `examples/136-comptime-string-global.sx`: `GREETING :: #run + build_greeting();` where `build_greeting` returns `concat("hello", + " world")` — runtime prints `greeting = 'hello world' / greeting.len + = 11`. Pre-1.4a this segfaulted. 158/158 example tests + chess + clean on all three platforms via `tools/verify-step.sh`. + - **Allocator `init` returns the state by value.** Building on the Option 3 lvalue-borrow rule, `GPA.init`, `Arena.init`, and `TrackingAllocator.init` now return `T` (not `*T`). The caller binds @@ -230,22 +265,17 @@ allocator). Open follow-ups, in roughly the order they make sense: -- **Phase 1.4a** — Thread IR `TypeId` (not just LLVM `LLVMTypeRef`) - through `valueToLLVMConst` so `heap_ptr` values from `#run` can be - serialized. Requires walking the struct/slice/primitive children - recursively; cycle detection via `(heap_id, type_id)` visited set. - Practical trigger: a `#run` that builds a `Widget.{}` and - protocol-erases via `xx`, producing a `heap_ptr` to the boxed - payload. None exists in-tree yet — surface it via a focused - regression alongside the implementation. 
+- **`.int → ptr` heap-walk follow-up.** Phase 1.4a handles the + fat-pointer aggregate case. A `.int` host-address landing in a + bare ptr field (e.g. a struct with a raw `[*]u8` member) still + bails. Requires recursive struct walking with cycle detection on + `(heap_id, type_id)` visited pairs. No practical trigger + in-tree; defer until a real `#run` site surfaces the need. - **`resolveType(null) -> .s64` audit.** The silent fallback at `lower.zig:8387` is still in place for every caller other than `lowerComptimeGlobal`. CLAUDE.md REJECTED PATTERNS forbids this shape. Survey callers; either make the default an error diagnostic or thread an inferred type per call site. -- **`tools/verify-step.sh` gate.** Run iOS sim + Android to confirm - this session's GlyphCache + Metal/Gles3 sweeps + Phase 1.4 didn't - regress non-macOS platforms. ## Phase 0.3 audit findings — chess allocator usage (closed) @@ -271,7 +301,18 @@ Allocator value naturally. ## Log -- **2026-05-25 (latest)** — Allocator `init` returns the state by +- **2026-05-25 (latest)** — Phase 1.4a shipped. `valueToLLVMConst` + takes IR `TypeId` (not LLVM type) + an interpreter handle. + String/slice fat pointers are serialized by capturing the + pointed-to bytes (via `interp.heapSlice` for heap_ptr, raw + process memory via new `readHostBytes` for byte_ptr / .int, + direct slicing for a string literal) and emitting a private global byte array. Struct + / array aggregates recurse with declared field/element TypeIds. + The trap case (`.int` landing in a ptr slot outside a fat + pointer) bails loudly. `Interpreter.heapSlice` promoted to + `pub`. Regression: `examples/136-comptime-string-global.sx`. + 158/158 + chess green on all three platforms. +- **2026-05-25 (penultimate)** — Allocator `init` returns the state by value. GPA / Arena / TrackingAllocator all changed; `Arena.deinit` no longer self-deallocs. `UIPipeline.arena_a/_b` embedded as values; `@self.arena_a` at the *Arena use site. 
`examples/50-smoke.sx` diff --git a/examples/136-comptime-string-global.sx b/examples/136-comptime-string-global.sx new file mode 100644 index 0000000..abbe65d --- /dev/null +++ b/examples/136-comptime-string-global.sx @@ -0,0 +1,26 @@ +// Phase 1.4a — a `#run` that returns a string (or any aggregate +// containing a heap-allocated buffer) must serialize correctly into +// the static binary. The interp computes the string at build time, +// allocating its backing through `context.allocator` (which bottoms +// out at libc_malloc in the default context). The serializer reads +// the resulting `{addr, len}` aggregate, captures the bytes from +// host memory, emits them as a private global byte array, and +// rebuilds the aggregate to point at that array. +// +// Before Phase 1.4a this segfaulted at runtime — the pointer field +// in the static const ended up as `i0 0` (malformed) because the +// interp's host-address `.int` value can't be lowered as `ptr` by +// `LLVMConstInt`. +#import "modules/std.sx"; + +build_greeting :: () -> string { + return concat("hello", " world"); +} + +GREETING :: #run build_greeting(); + +main :: () -> s32 { + print("greeting = '{}'\n", GREETING); + print("greeting.len = {}\n", GREETING.len); + return 0; +} diff --git a/src/ir/emit_llvm.zig b/src/ir/emit_llvm.zig index 2f8fa18..d4a129d 100644 --- a/src/ir/emit_llvm.zig +++ b/src/ir/emit_llvm.zig @@ -673,7 +673,7 @@ pub const LLVMEmitter = struct { std.debug.print("error: comptime init of '{s}' failed: {s} (op={s}{s}{s})\n", .{ gname, @errorName(err), op, sep, detail }); break :blk .void_val; }; - const init_val = self.valueToLLVMConst(result, llvm_ty, self.ir_mod.types.getString(global.name)); + const init_val = self.valueToLLVMConst(result, global.ty, &interp_inst, self.ir_mod.types.getString(global.name)); c.LLVMSetInitializer(llvm_global, init_val); } else if (global.init_val) |iv| { const init_val = switch (iv) { @@ -731,67 +731,68 @@ pub const LLVMEmitter = struct { } } + /// Read 
`len` bytes from `addr` in the current process. Used to lift + /// comptime-evaluated heap data into a static binary constant — the + /// interp ran in this process, so any libc-malloc'd buffer it + /// produced is still mapped and readable. Returns `null` on a + /// null/zero address (callers must special-case the empty slice + /// before calling this); any other address is trusted as-is. + fn readHostBytes(addr: usize, len: usize) ?[]const u8 { + if (addr == 0) return null; + const ptr: [*]const u8 = @ptrFromInt(addr); + return ptr[0..len]; + } + /// Serialize an interp `Value` to an LLVM constant for use as a static - /// global initializer. `global_name` is included in any diagnostic the - /// path produces, so the user can locate the offending `#run` site. + /// global initializer. `ty` is the IR-level type of the destination; + /// the LLVM type is derived from it. `interp` gives access to the + /// interpreter's heap so heap_ptr values can be walked. `global_name` + /// is included in any diagnostic the path produces so the user can + /// locate the offending `#run` site. + /// /// Returns `LLVMGetUndef` on bail — the build continues so adjacent - /// constants can still emit, but the surfaced diagnostic surfaces the - /// problem clearly. + /// constants can still emit, but the diagnostic makes the problem clear. fn valueToLLVMConst( self: *LLVMEmitter, val: Value, - llvm_ty: c.LLVMTypeRef, + ty: TypeId, + interp: *const Interpreter, global_name: []const u8, ) c.LLVMValueRef { + const llvm_ty = self.toLLVMType(ty); return switch (val) { - .int => |v| c.LLVMConstInt(llvm_ty, @bitCast(v), 1), + .int => |v| blk: { + // Host-pointer-as-int trap: the interp marshals raw pointers + // (libc-malloc'd buffers, etc.) into a .int that holds the + // host address. When that address is meant for a `ptr` slot + // in the destination type, emitting `LLVMConstInt` against + // the ptr type silently produces a malformed `i0 0`. 
The + // string/slice paths in `serializeAggregateValue` handle this case by reading the + // pointed-to bytes; anything else with an int landing in a + // ptr slot is a Phase-1.4a heap-walk case we don't yet + // know how to serialize. + const kind = c.LLVMGetTypeKind(llvm_ty); + if (kind == c.LLVMPointerTypeKind) { + std.debug.print( + "error: comptime init of '{s}' produced a raw integer for a pointer field — needs IR-typed heap-walk serialization (Phase 1.4a heap-walk follow-up)\n", + .{global_name}, + ); + break :blk c.LLVMGetUndef(llvm_ty); + } + break :blk c.LLVMConstInt(llvm_ty, @bitCast(v), 1); + }, .float => |v| c.LLVMConstReal(llvm_ty, v), .boolean => |v| c.LLVMConstInt(llvm_ty, @intFromBool(v), 0), .null_val => c.LLVMConstNull(llvm_ty), .void_val, .undef => c.LLVMGetUndef(llvm_ty), .func_ref => |fid| self.func_map.get(fid.index()) orelse c.LLVMConstNull(llvm_ty), .string => |s| self.emitConstStringGlobal(s), - .aggregate => |fields| blk: { - const kind = c.LLVMGetTypeKind(llvm_ty); - if (kind == c.LLVMStructTypeKind) { - const field_count = c.LLVMCountStructElementTypes(llvm_ty); - if (field_count != @as(c_uint, @intCast(fields.len))) { - std.debug.print( - "error: comptime init of '{s}' produced aggregate with {} fields but the destination type expects {}\n", - .{ global_name, fields.len, field_count }, - ); - break :blk c.LLVMGetUndef(llvm_ty); - } - var field_vals = std.ArrayList(c.LLVMValueRef).empty; - defer field_vals.deinit(self.alloc); - for (fields, 0..) 
|f, i| { - const field_ty = c.LLVMStructGetTypeAtIndex(llvm_ty, @intCast(i)); - field_vals.append(self.alloc, self.valueToLLVMConst(f, field_ty, global_name)) catch unreachable; - } - break :blk c.LLVMConstNamedStruct(llvm_ty, field_vals.items.ptr, @intCast(field_vals.items.len)); - } - if (kind == c.LLVMArrayTypeKind) { - const elem_ty = c.LLVMGetElementType(llvm_ty); - var elem_vals = std.ArrayList(c.LLVMValueRef).empty; - defer elem_vals.deinit(self.alloc); - for (fields) |f| { - elem_vals.append(self.alloc, self.valueToLLVMConst(f, elem_ty, global_name)) catch unreachable; - } - break :blk c.LLVMConstArray2(elem_ty, elem_vals.items.ptr, @intCast(elem_vals.items.len)); - } - std.debug.print( - "error: comptime init of '{s}' produced an aggregate but the destination LLVM type is neither struct nor array (kind={})\n", - .{ global_name, kind }, - ); - break :blk c.LLVMGetUndef(llvm_ty); - }, + .aggregate => |fields| self.serializeAggregateValue(fields, ty, interp, global_name), // The remaining Value variants cannot become static binary - // constants. Bail loudly with the global name so the user can - // identify the offending #run site. - // - heap_ptr / byte_ptr: pointer into interp/host memory; can't survive into a binary const without type-threaded serialization (Phase 1.4a follow-up). - // - slot_ptr: frame-local; meaningless outside the call that produced it. - // - closure: env is dynamic. - // - type_tag: compile-time-only Type value. + // constants outside of a fat-pointer aggregate. Bail loudly. + // (`heap_ptr` / `byte_ptr` / `int → ptr` are handled inside + // `serializeAggregateValue` when they appear in a string or + // slice fat-pointer's data field.) 
.heap_ptr, .byte_ptr, .slot_ptr, .closure, .type_tag => blk: { std.debug.print( "error: comptime init of '{s}' produced a {s} value, which cannot be serialized as a static constant\n",
@@ -802,6 +803,134 @@ pub const LLVMEmitter = struct {
         };
     }
 
+    /// Helper for `valueToLLVMConst` — serialize an aggregate value
+    /// against an IR TypeId. Splits on the type:
+    ///
+    ///   - `string` / `slice` — fat pointer `{ data, len }`. The data
+    ///     field can be a heap_ptr (interp-managed memory), byte_ptr
+    ///     (raw host address), int (same), or string literal. The len
+    ///     field is consulted to know how many bytes to capture from
+    ///     the data. Bytes are emitted as a private global byte array
+    ///     and the aggregate constant points at it. A zero len captures
+    ///     nothing, so the data field is never dereferenced (it may be
+    ///     null); a negative len bails with a diagnostic.
+    ///   - `struct` — walk the IR field types in lockstep with the
+    ///     value fields; recurse per field with its declared TypeId.
+    ///   - `array` — walk elements with the array's element TypeId;
+    ///     the element count must equal the destination array length.
+    ///
+    /// Every failure prints a diagnostic naming `global_name` and
+    /// returns `LLVMGetUndef` of the destination type.
+    fn serializeAggregateValue(
+        self: *LLVMEmitter,
+        fields: []const Value,
+        ty: TypeId,
+        interp: *const Interpreter,
+        global_name: []const u8,
+    ) c.LLVMValueRef {
+        const llvm_ty = self.toLLVMType(ty);
+
+        // Fat-pointer types: extract len, then read bytes from the data
+        // field's address (whatever flavour the interp produced for it).
+        // NOTE(review): len is used as a byte count, which fits `string`;
+        // a slice of wider elements would presumably need len scaled by
+        // the element size — confirm before relying on non-byte slices.
+        const is_string = (ty == .string);
+        const is_slice = !ty.isBuiltin() and self.ir_mod.types.get(ty) == .slice;
+        if ((is_string or is_slice) and fields.len == 2) {
+            const data = fields[0];
+            const len_i = fields[1].asInt() orelse {
+                std.debug.print(
+                    "error: comptime init of '{s}' produced a fat-pointer aggregate whose len field is not an integer\n",
+                    .{global_name},
+                );
+                return c.LLVMGetUndef(llvm_ty);
+            };
+            // `@intCast` of a negative len to usize is illegal behaviour;
+            // use a checked cast and report it like any other bad value.
+            const len = std.math.cast(usize, len_i) orelse {
+                std.debug.print(
+                    "error: comptime init of '{s}' produced a fat-pointer aggregate whose len field ({}) is out of range\n",
+                    .{ global_name, len_i },
+                );
+                return c.LLVMGetUndef(llvm_ty);
+            };
+
+            // Zero bytes to capture: skip the data field entirely so an
+            // empty string/slice serializes whatever its pointer holds.
+            const no_bytes: []const u8 = &.{};
+            const bytes_opt: ?[]const u8 = if (len == 0) no_bytes else switch (data) {
+                .heap_ptr => |hp| blk: {
+                    const mem = interp.heapSlice(hp) orelse break :blk null;
+                    break :blk if (len <= mem.len) mem[0..len] else null;
+                },
+                .byte_ptr => |addr| readHostBytes(addr, len),
+                // `readHostBytes` itself rejects a zero address.
+                .int => |v| readHostBytes(@as(usize, @bitCast(v)), len),
+                .string => |s| if (len <= s.len) s[0..len] else null,
+                else => null,
+            };
+
+            const bytes = bytes_opt orelse {
+                std.debug.print(
+                    "error: comptime init of '{s}' produced a fat-pointer aggregate whose data field ({s}) cannot be resolved to {} bytes — needs Phase 1.4a heap-walk for this shape\n",
+                    .{ global_name, @tagName(data), len },
+                );
+                return c.LLVMGetUndef(llvm_ty);
+            };
+
+            return self.emitConstStringGlobal(bytes);
+        }
+
+        // Generic struct: walk IR fields by their declared TypeIds.
+        if (!ty.isBuiltin()) {
+            const info = self.ir_mod.types.get(ty);
+            if (info == .@"struct") {
+                const ir_fields = info.@"struct".fields;
+                if (ir_fields.len != fields.len) {
+                    std.debug.print(
+                        "error: comptime init of '{s}' produced aggregate with {} fields but struct '{s}' expects {}\n",
+                        .{ global_name, fields.len, self.ir_mod.types.getString(info.@"struct".name), ir_fields.len },
+                    );
+                    return c.LLVMGetUndef(llvm_ty);
+                }
+                var field_vals = std.ArrayList(c.LLVMValueRef).empty;
+                defer field_vals.deinit(self.alloc);
+                for (ir_fields, fields) |ir_field, fv| {
+                    field_vals.append(self.alloc, self.valueToLLVMConst(fv, ir_field.ty, interp, global_name)) catch unreachable;
+                }
+                return c.LLVMConstNamedStruct(llvm_ty, field_vals.items.ptr, @intCast(field_vals.items.len));
+            }
+            if (info == .array) {
+                const elem_ty = info.array.element;
+                const llvm_elem_ty = self.toLLVMType(elem_ty);
+                // Mirror the struct arity check: a constant with the wrong
+                // element count would not match the global's declared type.
+                if (c.LLVMGetTypeKind(llvm_ty) == c.LLVMArrayTypeKind and
+                    c.LLVMGetArrayLength2(llvm_ty) != fields.len)
+                {
+                    std.debug.print(
+                        "error: comptime init of '{s}' produced aggregate with {} elements but the destination array expects {}\n",
+                        .{ global_name, fields.len, c.LLVMGetArrayLength2(llvm_ty) },
+                    );
+                    return c.LLVMGetUndef(llvm_ty);
+                }
+                var elem_vals = std.ArrayList(c.LLVMValueRef).empty;
+                defer elem_vals.deinit(self.alloc);
+                for (fields) |fv| {
+                    elem_vals.append(self.alloc, self.valueToLLVMConst(fv, elem_ty, interp, global_name)) catch unreachable;
+                }
+                return c.LLVMConstArray2(llvm_elem_ty, elem_vals.items.ptr, @intCast(elem_vals.items.len));
+            }
+        }
+
+        std.debug.print(
+            "error: comptime init of '{s}' produced an aggregate but the destination type ({s}) is neither struct, array, string, nor slice\n",
+            .{ global_name, self.ir_mod.types.typeName(ty) },
+        );
+        return c.LLVMGetUndef(llvm_ty);
+    }
+
     // ── Function declaration ────────────────────────────────────────
 
     fn declareFunction(self: *LLVMEmitter, func: *const Function, func_idx: u32) void {
diff --git a/src/ir/interp.zig b/src/ir/interp.zig index cd6755a..47aeb00 100644 --- a/src/ir/interp.zig +++ b/src/ir/interp.zig @@ -296,7 +296,7 @@ pub const Interpreter = struct { } } - fn heapSlice(self: *const Interpreter, hp: Value.HeapPtr) ?[]u8 { + pub fn heapSlice(self: *const Interpreter, hp: Value.HeapPtr) ?[]u8 { if (hp.id >= self.heap.items.len) return 
null; const mem = self.heap.items[hp.id]; if (hp.offset >= mem.len) return null; diff --git a/tests/expected/136-comptime-string-global.exit b/tests/expected/136-comptime-string-global.exit new file mode 100644 index 0000000..573541a --- /dev/null +++ b/tests/expected/136-comptime-string-global.exit @@ -0,0 +1 @@ +0 diff --git a/tests/expected/136-comptime-string-global.txt b/tests/expected/136-comptime-string-global.txt new file mode 100644 index 0000000..7af7479 --- /dev/null +++ b/tests/expected/136-comptime-string-global.txt @@ -0,0 +1,2 @@ +greeting = 'hello world' +greeting.len = 11