From bf471460851eeded56829b4f71f5e16a9f26a80f Mon Sep 17 00:00:00 2001 From: agra Date: Wed, 10 Jun 2026 17:34:35 +0300 Subject: [PATCH] fix(0110): for-over-array by-value fetch reads one element, not a full copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lowerFor's by-value element fetch emitted index_get on the array VALUE; the emitter realizes that as a whole-array spill to a stack temp + GEP, per iteration — O(N^2) bytes copied per loop (and pre-0109 it also grew the stack per iteration, segfaulting a [4096]s64 loop). When the iterable is an array with addressable storage (and not deref'd from a pointer, whose identifier alloca holds the pointer rather than the array), the fetch is now index_gep on the storage + one element load. Storage-less arrays keep the index_get fallback. The loaded element remains a copy — mutating the capture does not write back. Regression: examples/0048-basic-for-array-large.sx (sum over 4096 elements + by-value copy-guard). --- examples/0048-basic-for-array-large.sx | 24 ++++ .../expected/0048-basic-for-array-large.exit | 1 + .../0048-basic-for-array-large.stderr | 1 + .../0048-basic-for-array-large.stdout | 2 + ...110-for-array-by-value-full-array-spill.md | 112 ++++++++++++++++++ src/ir/lower/control_flow.zig | 21 +++- 6 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 examples/0048-basic-for-array-large.sx create mode 100644 examples/expected/0048-basic-for-array-large.exit create mode 100644 examples/expected/0048-basic-for-array-large.stderr create mode 100644 examples/expected/0048-basic-for-array-large.stdout create mode 100644 issues/0110-for-array-by-value-full-array-spill.md diff --git a/examples/0048-basic-for-array-large.sx b/examples/0048-basic-for-array-large.sx new file mode 100644 index 0000000..27cea3f --- /dev/null +++ b/examples/0048-basic-for-array-large.sx @@ -0,0 +1,24 @@ +// Collection-form `for` over an array, by-value capture: each iteration +// reads ONE element from the array's storage (GEP + load), and the capture +// stays a copy — mutating it never writes back to the array. +// Regression (issue 0110): the element fetch was `index_get` on the array +// VALUE, spilling a full copy of the array to a stack temp per iteration — +// O(N²) bytes copied, and (pre-0109) per-iteration stack growth that made +// this 4096-element loop segfault. + +#import "modules/std.sx"; + +main :: () -> s32 { + arr : [4096]s64 = ---; + i := 0; + while i < 4096 { arr[i] = i; i += 1; } + sum := 0; + for arr: (x) { sum += x; } + print("sum={}\n", sum); + + // By-value capture is a copy: mutating it leaves the array untouched. + small : [3]s64 = .[10, 20, 30]; + for small: (x) { x += 100; } + print("copy-guard: {} {} {}\n", small[0], small[1], small[2]); + 0 +} diff --git a/examples/expected/0048-basic-for-array-large.exit b/examples/expected/0048-basic-for-array-large.exit new file mode 100644 index 0000000..573541a --- /dev/null +++ b/examples/expected/0048-basic-for-array-large.exit @@ -0,0 +1 @@ +0 diff --git a/examples/expected/0048-basic-for-array-large.stderr b/examples/expected/0048-basic-for-array-large.stderr new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/examples/expected/0048-basic-for-array-large.stderr @@ -0,0 +1 @@ + diff --git a/examples/expected/0048-basic-for-array-large.stdout b/examples/expected/0048-basic-for-array-large.stdout new file mode 100644 index 0000000..142d31b --- /dev/null +++ b/examples/expected/0048-basic-for-array-large.stdout @@ -0,0 +1,2 @@ +sum=8386560 +copy-guard: 10 20 30 diff --git a/issues/0110-for-array-by-value-full-array-spill.md b/issues/0110-for-array-by-value-full-array-spill.md new file mode 100644 index 0000000..c1c36bf --- /dev/null +++ b/issues/0110-for-array-by-value-full-array-spill.md @@ -0,0 +1,112 @@ +# RESOLVED — 0110: `for arr: (x)` over an array copies the ENTIRE array per element + +**Root cause:** `lowerFor`'s by-value element fetch emitted `index_get` on the +array *value*; the emitter realizes that as spill-whole-array-to-temp + GEP one +element, per iteration — O(N²) bytes copied (and pre-0109, per-iteration stack +growth that segfaulted a `[4096]s64` loop). + +**Fix:** in `src/ir/lower/control_flow.zig` `lowerFor`, when the iterable is an +array with addressable storage (`getExprAlloca` hit, and the iterable was not +deref'd from a pointer — a deref'd identifier's alloca holds the pointer, not +the array), the by-value fetch is `index_gep` on the storage + a single element +`load`. Storage-less arrays and the deref'd-pointer case keep the `index_get` +fallback. Capture semantics unchanged: the loaded element is still a copy +(mutating it does not write back), matching the by-ref branch's base resolution. + +**Regression test:** `examples/0048-basic-for-array-large.sx` (4096-element sum +prints `sum=8386560`, segfaulted pre-fix; copy-guard confirms by-value capture +mutation leaves the array untouched). + +--- + +# 0110 — `for arr: (x)` over an array copies the ENTIRE array per element + +**Symptom.** The collection-form `for` with by-value capture over an array +lowers the element fetch as `index_get` on the array *value*, which the LLVM +emitter realizes as: load the whole array as an SSA value, spill it to a +fresh `ig.tmp` alloca, GEP one element. Per iteration. Observed: a `for` over +a `[4096]s64` array segfaults (4096 iterations × 32KB spill = ~134MB of stack +— see issue 0109 for why body allocas never unwind); a `[256]s64` version +completes but copies 256 × 2KB = 512KB to read 2KB of data. Expected: O(1) +stack, O(N) total work — GEP into the array's existing storage and load the +single element. + +Even after 0109's entry-block hoist (which collapses the spill to one reused +slot and fixes the crash), the full-array copy per element remains — O(N²) +bytes copied per loop. The by-ref capture path (`for arr: (*x)`) already does +this right: it GEPs into the array's alloca (`getExprAlloca`) without +copying. By-value should reuse the same base and just add a load. + +## Reproduction + +```sx +#import "modules/std.sx"; + +main :: () -> s32 { + arr : [4096]s64 = ---; + i := 0; + while i < 4096 { arr[i] = i; i += 1; } + sum := 0; + for arr: (x) { sum += x; } + print("sum={}\n", sum); + 0 +} +``` + +- **Observed**: `Segmentation fault` (stack overflow). With `256` instead of + `4096` it prints `sum=32640` — and `sx ir` shows the per-iteration spill: + +```llvm +for.body.4: + %ig.tmp = alloca [256 x i64], align 8 ; fresh full-array temp + store [256 x i64] %load6, ptr %ig.tmp, align 8 ; copy ALL 256 elements + %ig.ptr = getelementptr [256 x i64], ptr %ig.tmp, i64 0, i64 %load8 + %ig.val = load i64, ptr %ig.ptr, align 8 ; ...to read ONE +``` + +- **Expected**: `sum=8386560`, exit 0; the body GEPs `arr`'s own storage: + `getelementptr` on the original alloca + a single `i64` load, no array-sized + temp, no copy. + +Repro co-located: `issues/0110-for-array-by-value-full-array-spill.sx` +(unpinned until fixed; depends on 0109 only for the crash, not for the copy). + +## Root cause (suspected area) + +`src/ir/lower/control_flow.zig` `lowerFor` (~336-343): the by-ref branch +builds `index_gep` on the array's storage (`getExprAlloca(fe.iterable) orelse +iterable`), but the by-value branch emits +`.index_get = .{ .lhs = iterable, .rhs = idx_val }` where `iterable` is the +loaded array **value**. The emitter's `index_get` on an aggregate SSA value +(`src/ir/emit_llvm.zig`, `ig.tmp` site ~2100s) has no address to index, so it +spills the whole value first. + +Fix shape: in `lowerFor`, when the iterable is an array with storage +(`getExprAlloca` hit — same condition the by-ref branch uses), lower the +by-value element as `index_gep` on that storage followed by a `load` of the +element type, instead of `index_get` on the value. Capture semantics are +unchanged: the load still produces a per-iteration copy of the *element* +(mutating `x` must not write back; mutating the array inside the body is +already visible to subsequent iterations through the same storage in the +by-ref form — match existing behavior with a test). Keep the `index_get` +fallback for storage-less iterables (e.g. an array returned by a call). + +## Investigation prompt (paste into a fresh session) + +> Fix issue 0110: collection-form `for arr: (x)` (by-value capture) over an +> array spills the entire array per iteration. In +> `src/ir/lower/control_flow.zig` `lowerFor` (~336-343), make the by-value +> element fetch reuse the by-ref branch's base resolution: if the iterable is +> an `.array` and `getExprAlloca(fe.iterable)` (or the deref'd pointer base) +> yields storage, emit `index_gep` on that storage + `builder.load` of the +> element type; otherwise keep the existing `index_get` path. Slices/List +> views are unaffected (their `index_get` is already pointer-backed). +> +> Verify: `./zig-out/bin/sx run issues/0110-for-array-by-value-full-array-spill.sx` +> prints `sum=8386560` exit 0 (after 0109 lands; before it, verify the +> 256-element variant's `sx ir` shows no `ig.tmp` array spill in +> `for.body.*`). Add a semantics guard test: mutate `x` in the body and +> confirm the array is unchanged after the loop (by-value capture stays a +> copy). Then `zig build && zig build test && bash tests/run_examples.sh`, +> review any `.ir` snapshot diffs, and promote the repro to +> `examples/00xx-basic-for-array-large.sx`. diff --git a/src/ir/lower/control_flow.zig b/src/ir/lower/control_flow.zig index efa1152..efbe6db 100644 --- a/src/ir/lower/control_flow.zig +++ b/src/ir/lower/control_flow.zig @@ -289,11 +289,15 @@ pub fn lowerFor(self: *Lowering, fe: *const ast.ForExpr) Ref { var iterable = self.lowerExpr(fe.iterable); var iterable_ty = self.inferExprType(fe.iterable); - // `*List` / `*[]T` etc. — deref to the collection value. + // `*List` / `*[]T` etc. — deref to the collection value. Tracked because + // a deref'd iterable's identifier binding holds the POINTER, so its + // alloca is not the collection's storage. + var was_deref = false; const ptr_info = if (iterable_ty.isBuiltin()) null else self.module.types.get(iterable_ty); if (ptr_info != null and ptr_info.? == .pointer) { iterable = self.builder.load(iterable, ptr_info.?.pointer.pointee); iterable_ty = ptr_info.?.pointer.pointee; + was_deref = true; } // A `List(T)`-like struct iterates its `items[0..len]`; arrays/slices @@ -333,14 +337,25 @@ pub fn lowerFor(self: *Lowering, fe: *const ast.ForExpr) Ref { // binds a value copy. const elem_ty = self.getElementType(iterable_ty); const bind_ty = if (fe.capture_by_ref) self.module.types.ptrTo(elem_ty) else elem_ty; + const is_array = !iterable_ty.isBuiltin() and self.module.types.get(iterable_ty) == .array; const elem = if (fe.capture_by_ref) blk: { // A slice value carries its backing pointer, so GEP on it writes // through. An array is a value — GEP needs its storage (alloca) or // mutations would hit a copy. - const is_array = !iterable_ty.isBuiltin() and self.module.types.get(iterable_ty) == .array; const base = if (is_array) (self.getExprAlloca(fe.iterable) orelse iterable) else iterable; break :blk self.builder.emit(.{ .index_gep = .{ .lhs = base, .rhs = idx_val } }, bind_ty); - } else self.builder.emit(.{ .index_get = .{ .lhs = iterable, .rhs = idx_val } }, bind_ty); + } else blk: { + // By-value over an array with addressable storage: GEP + load ONE + // element. `index_get` on the array VALUE spills the whole array to + // a temp on every iteration — O(N²) bytes copied per loop. + if (is_array and !was_deref) { + if (self.getExprAlloca(fe.iterable)) |storage| { + const elem_ptr = self.builder.emit(.{ .index_gep = .{ .lhs = storage, .rhs = idx_val } }, self.module.types.ptrTo(elem_ty)); + break :blk self.builder.load(elem_ptr, elem_ty); + } + } + break :blk self.builder.emit(.{ .index_get = .{ .lhs = iterable, .rhs = idx_val } }, bind_ty); + }; var body_scope = Scope.init(self.alloc, self.scope); const old_scope = self.scope;