fix(0110): for-over-array by-value fetch reads one element, not a full copy
lowerFor's by-value element fetch emitted index_get on the array VALUE; the emitter realizes that as a whole-array spill to a stack temp + GEP, per iteration — O(N^2) bytes copied per loop (and pre-0109 it also grew the stack per iteration, segfaulting a [4096]s64 loop). When the iterable is an array with addressable storage (and not deref'd from a pointer, whose identifier alloca holds the pointer rather than the array), the fetch is now index_gep on the storage + one element load. Storage-less arrays keep the index_get fallback. The loaded element remains a copy — mutating the capture does not write back. Regression: examples/0048-basic-for-array-large.sx (sum over 4096 elements + by-value copy-guard).
This commit is contained in:
24
examples/0048-basic-for-array-large.sx
Normal file
24
examples/0048-basic-for-array-large.sx
Normal file
@@ -0,0 +1,24 @@
|
||||
// Collection-form `for` over an array, by-value capture: each iteration
|
||||
// reads ONE element from the array's storage (GEP + load), and the capture
|
||||
// stays a copy — mutating it never writes back to the array.
|
||||
// Regression (issue 0110): the element fetch was `index_get` on the array
|
||||
// VALUE, spilling a full copy of the array to a stack temp per iteration —
|
||||
// O(N²) bytes copied, and (pre-0109) per-iteration stack growth that made
|
||||
// this 4096-element loop segfault.
|
||||
|
||||
#import "modules/std.sx";
|
||||
|
||||
main :: () -> s32 {
|
||||
arr : [4096]s64 = ---;
|
||||
i := 0;
|
||||
while i < 4096 { arr[i] = i; i += 1; }
|
||||
sum := 0;
|
||||
for arr: (x) { sum += x; }
|
||||
print("sum={}\n", sum);
|
||||
|
||||
// By-value capture is a copy: mutating it leaves the array untouched.
|
||||
small : [3]s64 = .[10, 20, 30];
|
||||
for small: (x) { x += 100; }
|
||||
print("copy-guard: {} {} {}\n", small[0], small[1], small[2]);
|
||||
0
|
||||
}
|
||||
1
examples/expected/0048-basic-for-array-large.exit
Normal file
1
examples/expected/0048-basic-for-array-large.exit
Normal file
@@ -0,0 +1 @@
|
||||
0
|
||||
1
examples/expected/0048-basic-for-array-large.stderr
Normal file
1
examples/expected/0048-basic-for-array-large.stderr
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
2
examples/expected/0048-basic-for-array-large.stdout
Normal file
2
examples/expected/0048-basic-for-array-large.stdout
Normal file
@@ -0,0 +1,2 @@
|
||||
sum=8386560
|
||||
copy-guard: 10 20 30
|
||||
112
issues/0110-for-array-by-value-full-array-spill.md
Normal file
112
issues/0110-for-array-by-value-full-array-spill.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# RESOLVED — 0110: `for arr: (x)` over an array copies the ENTIRE array per element
|
||||
|
||||
**Root cause:** `lowerFor`'s by-value element fetch emitted `index_get` on the
|
||||
array *value*; the emitter realizes that as spill-whole-array-to-temp + GEP one
|
||||
element, per iteration — O(N²) bytes copied (and pre-0109, per-iteration stack
|
||||
growth that segfaulted a `[4096]s64` loop).
|
||||
|
||||
**Fix:** in `src/ir/lower/control_flow.zig` `lowerFor`, when the iterable is an
|
||||
array with addressable storage (`getExprAlloca` hit, and the iterable was not
|
||||
deref'd from a pointer — a deref'd identifier's alloca holds the pointer, not
|
||||
the array), the by-value fetch is `index_gep` on the storage + a single element
|
||||
`load`. Storage-less arrays and the deref'd-pointer case keep the `index_get`
|
||||
fallback. Capture semantics unchanged: the loaded element is still a copy
|
||||
(mutating it does not write back), matching the by-ref branch's base resolution.
|
||||
|
||||
**Regression test:** `examples/0048-basic-for-array-large.sx` (4096-element sum
|
||||
prints `sum=8386560`, segfaulted pre-fix; copy-guard confirms by-value capture
|
||||
mutation leaves the array untouched).
|
||||
|
||||
---
|
||||
|
||||
# 0110 — `for arr: (x)` over an array copies the ENTIRE array per element
|
||||
|
||||
**Symptom.** The collection-form `for` with by-value capture over an array
|
||||
lowers the element fetch as `index_get` on the array *value*, which the LLVM
|
||||
emitter realizes as: load the whole array as an SSA value, spill it to a
|
||||
fresh `ig.tmp` alloca, GEP one element. Per iteration. Observed: a `for` over
|
||||
a `[4096]s64` array segfaults (4096 iterations × 32KB spill = ~134MB of stack
|
||||
— see issue 0109 for why body allocas never unwind); a `[256]s64` version
|
||||
completes but copies 256 × 2KB = 512KB to read 2KB of data. Expected: O(1)
|
||||
stack, O(N) total work — GEP into the array's existing storage and load the
|
||||
single element.
|
||||
|
||||
Even after 0109's entry-block hoist (which collapses the spill to one reused
|
||||
slot and fixes the crash), the full-array copy per element remains — O(N²)
|
||||
bytes copied per loop. The by-ref capture path (`for arr: (*x)`) already does
|
||||
this right: it GEPs into the array's alloca (`getExprAlloca`) without
|
||||
copying. By-value should reuse the same base and just add a load.
|
||||
|
||||
## Reproduction
|
||||
|
||||
```sx
|
||||
#import "modules/std.sx";
|
||||
|
||||
main :: () -> s32 {
|
||||
arr : [4096]s64 = ---;
|
||||
i := 0;
|
||||
while i < 4096 { arr[i] = i; i += 1; }
|
||||
sum := 0;
|
||||
for arr: (x) { sum += x; }
|
||||
print("sum={}\n", sum);
|
||||
0
|
||||
}
|
||||
```
|
||||
|
||||
- **Observed**: `Segmentation fault` (stack overflow). With `256` instead of
|
||||
`4096` it prints `sum=32640` — and `sx ir` shows the per-iteration spill:
|
||||
|
||||
```llvm
|
||||
for.body.4:
|
||||
%ig.tmp = alloca [256 x i64], align 8 ; fresh full-array temp
|
||||
store [256 x i64] %load6, ptr %ig.tmp, align 8 ; copy ALL 256 elements
|
||||
%ig.ptr = getelementptr [256 x i64], ptr %ig.tmp, i64 0, i64 %load8
|
||||
%ig.val = load i64, ptr %ig.ptr, align 8 ; ...to read ONE
|
||||
```
|
||||
|
||||
- **Expected**: `sum=8386560`, exit 0; the body GEPs `arr`'s own storage:
|
||||
`getelementptr` on the original alloca + a single `i64` load, no array-sized
|
||||
temp, no copy.
|
||||
|
||||
Repro co-located: `issues/0110-for-array-by-value-full-array-spill.sx`
|
||||
(unpinned until fixed; depends on 0109 only for the crash, not for the copy).
|
||||
|
||||
## Root cause (suspected area)
|
||||
|
||||
`src/ir/lower/control_flow.zig` `lowerFor` (~336-343): the by-ref branch
|
||||
builds `index_gep` on the array's storage (`getExprAlloca(fe.iterable) orelse
|
||||
iterable`), but the by-value branch emits
|
||||
`.index_get = .{ .lhs = iterable, .rhs = idx_val }` where `iterable` is the
|
||||
loaded array **value**. The emitter's `index_get` on an aggregate SSA value
|
||||
(`src/ir/emit_llvm.zig`, `ig.tmp` site ~2100s) has no address to index, so it
|
||||
spills the whole value first.
|
||||
|
||||
Fix shape: in `lowerFor`, when the iterable is an array with storage
|
||||
(`getExprAlloca` hit — same condition the by-ref branch uses), lower the
|
||||
by-value element as `index_gep` on that storage followed by a `load` of the
|
||||
element type, instead of `index_get` on the value. Capture semantics are
|
||||
unchanged: the load still produces a per-iteration copy of the *element*
|
||||
(mutating `x` must not write back; mutating the array inside the body is
|
||||
already visible to subsequent iterations through the same storage in the
|
||||
by-ref form — match existing behavior with a test). Keep the `index_get`
|
||||
fallback for storage-less iterables (e.g. an array returned by a call).
|
||||
|
||||
## Investigation prompt (paste into a fresh session)
|
||||
|
||||
> Fix issue 0110: collection-form `for arr: (x)` (by-value capture) over an
|
||||
> array spills the entire array per iteration. In
|
||||
> `src/ir/lower/control_flow.zig` `lowerFor` (~336-343), make the by-value
|
||||
> element fetch reuse the by-ref branch's base resolution: if the iterable is
|
||||
> an `.array` and `getExprAlloca(fe.iterable)` (or the deref'd pointer base)
|
||||
> yields storage, emit `index_gep` on that storage + `builder.load` of the
|
||||
> element type; otherwise keep the existing `index_get` path. Slices/List
|
||||
> views are unaffected (their `index_get` is already pointer-backed).
|
||||
>
|
||||
> Verify: `./zig-out/bin/sx run issues/0110-for-array-by-value-full-array-spill.sx`
|
||||
> prints `sum=8386560` exit 0 (after 0109 lands; before it, verify the
|
||||
> 256-element variant's `sx ir` shows no `ig.tmp` array spill in
|
||||
> `for.body.*`). Add a semantics guard test: mutate `x` in the body and
|
||||
> confirm the array is unchanged after the loop (by-value capture stays a
|
||||
> copy). Then `zig build && zig build test && bash tests/run_examples.sh`,
|
||||
> review any `.ir` snapshot diffs, and promote the repro to
|
||||
> `examples/00xx-basic-for-array-large.sx`.
|
||||
@@ -289,11 +289,15 @@ pub fn lowerFor(self: *Lowering, fe: *const ast.ForExpr) Ref {
|
||||
var iterable = self.lowerExpr(fe.iterable);
|
||||
var iterable_ty = self.inferExprType(fe.iterable);
|
||||
|
||||
// `*List` / `*[]T` etc. — deref to the collection value.
|
||||
// `*List` / `*[]T` etc. — deref to the collection value. Tracked because
|
||||
// a deref'd iterable's identifier binding holds the POINTER, so its
|
||||
// alloca is not the collection's storage.
|
||||
var was_deref = false;
|
||||
const ptr_info = if (iterable_ty.isBuiltin()) null else self.module.types.get(iterable_ty);
|
||||
if (ptr_info != null and ptr_info.? == .pointer) {
|
||||
iterable = self.builder.load(iterable, ptr_info.?.pointer.pointee);
|
||||
iterable_ty = ptr_info.?.pointer.pointee;
|
||||
was_deref = true;
|
||||
}
|
||||
|
||||
// A `List(T)`-like struct iterates its `items[0..len]`; arrays/slices
|
||||
@@ -333,14 +337,25 @@ pub fn lowerFor(self: *Lowering, fe: *const ast.ForExpr) Ref {
|
||||
// binds a value copy.
|
||||
const elem_ty = self.getElementType(iterable_ty);
|
||||
const bind_ty = if (fe.capture_by_ref) self.module.types.ptrTo(elem_ty) else elem_ty;
|
||||
const is_array = !iterable_ty.isBuiltin() and self.module.types.get(iterable_ty) == .array;
|
||||
const elem = if (fe.capture_by_ref) blk: {
|
||||
// A slice value carries its backing pointer, so GEP on it writes
|
||||
// through. An array is a value — GEP needs its storage (alloca) or
|
||||
// mutations would hit a copy.
|
||||
const is_array = !iterable_ty.isBuiltin() and self.module.types.get(iterable_ty) == .array;
|
||||
const base = if (is_array) (self.getExprAlloca(fe.iterable) orelse iterable) else iterable;
|
||||
break :blk self.builder.emit(.{ .index_gep = .{ .lhs = base, .rhs = idx_val } }, bind_ty);
|
||||
} else self.builder.emit(.{ .index_get = .{ .lhs = iterable, .rhs = idx_val } }, bind_ty);
|
||||
} else blk: {
|
||||
// By-value over an array with addressable storage: GEP + load ONE
|
||||
// element. `index_get` on the array VALUE spills the whole array to
|
||||
// a temp on every iteration — O(N²) bytes copied per loop.
|
||||
if (is_array and !was_deref) {
|
||||
if (self.getExprAlloca(fe.iterable)) |storage| {
|
||||
const elem_ptr = self.builder.emit(.{ .index_gep = .{ .lhs = storage, .rhs = idx_val } }, self.module.types.ptrTo(elem_ty));
|
||||
break :blk self.builder.load(elem_ptr, elem_ty);
|
||||
}
|
||||
}
|
||||
break :blk self.builder.emit(.{ .index_get = .{ .lhs = iterable, .rhs = idx_val } }, bind_ty);
|
||||
};
|
||||
|
||||
var body_scope = Scope.init(self.alloc, self.scope);
|
||||
const old_scope = self.scope;
|
||||
|
||||
Reference in New Issue
Block a user