From 967005621a3b3ae84843d573b2f65fb8ad37b673 Mon Sep 17 00:00:00 2001 From: agra Date: Mon, 15 Jun 2026 22:47:34 +0300 Subject: [PATCH] =?UTF-8?q?feat(asm):=20Phase=202=20=E2=80=94=20`->=20@pla?= =?UTF-8?q?ce`=20write-through=20outputs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An asm result can be STORED through a place (a local / struct field) instead of returned; the place output does not join the result tuple. - parser.zig: `-> @place` parses `@place` as an ordinary address-of expression → an out_place operand (the in-function form; reuses the existing `@` prefix). - inst.zig: AsmOperand gains out_ty (the output slot's value type) so emit can build the combined return struct without re-deriving from Inst.ty. - lower/expr.zig: out_place operand = the lowered @place address, out_ty = the pointee. Read-write (`+`) and indirect-memory (`*`) constraints rejected loudly (not yet implemented) rather than miscompiled. - ops.zig emitInlineAsm: the LLVM return type is built from ALL outputs (out_value + out_place); after the call, out_place slots are stored through their address and out_value slots rebuild the sx result. Fast path when there are no place outputs (the struct return IS the result — pure-value asm IR unchanged). Verified: write-to-local (42), struct field, mixed value+place (v=10 b=20), `+` rejected. Locked with 1649-platform-asm-place-output (mixed, runs on aarch64). zig build test green (657 corpus, 446 unit). --- current/CHECKPOINT-ASM.md | 54 ++++++++++++---- examples/1649-platform-asm-place-output.sx | 19 ++++++ .../1649-platform-asm-place-output.build | 1 + .../1649-platform-asm-place-output.exit | 1 + .../1649-platform-asm-place-output.ir | 25 ++++++++ .../1649-platform-asm-place-output.stderr | 1 + .../1649-platform-asm-place-output.stdout | 1 + src/backend/llvm/ops.zig | 62 +++++++++++++++++-- src/ir/inst.zig | 9 ++- src/ir/lower/expr.zig | 34 +++++++++- src/parser.zig | 15 +++-- 11 files changed, 198 insertions(+), 24 deletions(-) create mode 100644 examples/1649-platform-asm-place-output.sx create mode 100644 examples/expected/1649-platform-asm-place-output.build create mode 100644 examples/expected/1649-platform-asm-place-output.exit create mode 100644 examples/expected/1649-platform-asm-place-output.ir create mode 100644 examples/expected/1649-platform-asm-place-output.stderr create mode 100644 examples/expected/1649-platform-asm-place-output.stdout diff --git a/current/CHECKPOINT-ASM.md b/current/CHECKPOINT-ASM.md index 86fdd31..bdb8841 100644 --- a/current/CHECKPOINT-ASM.md +++ b/current/CHECKPOINT-ASM.md @@ -6,7 +6,25 @@ commit, one step at a time per the cadence rule (no commit may both add a test and make it pass). ## Last completed step -**F** — global (module-scope) asm. A top-level `asm { "tmpl", };` block (template +**2** — `-> @place` write-through outputs. An asm result can be **stored through +a place** (local / struct field) instead of returned; the place output does NOT +join the result tuple. Parser: `-> @place` parses the `@place` as an ordinary +address-of expression → an `out_place` operand (`src/parser.zig`). Lowering +(`lowerAsmExpr`): out_place operand = the lowered `@place` address, `out_ty` = +the pointee; read-write (`+`) and indirect-memory (`*`) constraints rejected +loudly (not yet implemented). Added `out_ty: TypeId` to the IR `AsmOperand` +(`src/ir/inst.zig`) so emit builds the **combined** return struct (ALL outputs). +`emitInlineAsm` rewrite (`src/backend/llvm/ops.zig`): the LLVM return type is now +built from every output's `out_ty`; after the call, out_place slots are +`store`d through their address and out_value slots rebuild the sx result — with a +**fast path** (no place outputs → the asm's struct return IS the result, so +pure-value asm IR is unchanged). Verified: write-to-local (`get42`→42), struct +field (`@p.b`), mixed value+place (`v=10 b=20`), `+` rejected. Locked with +`examples/1649-platform-asm-place-output.sx` (mixed, runs on aarch64). `zig build +test` green (657 corpus, 446 unit). Files: `src/parser.zig`, `src/ir/inst.zig`, +`src/ir/lower/expr.zig`, `src/backend/llvm/ops.zig`, `examples/1649-*`. + +Prior: **F** — global (module-scope) asm. A top-level `asm { "tmpl", };` block (template only) lowers to LLVM `module asm`, and a lib-less `extern` calls into the symbols it defines. New `asm_global` AST node (`src/ast.zig`) + `parseAsmGlobal` (`src/parser.zig`, dispatched from `parseTopLevel` on `kw_asm`) — rejects @@ -155,8 +173,9 @@ pipeline: lex (A.0) → parse (A.1) → validate (B.0/B.1 + `%[name]` check) → tuples (E). Register-class + register-pinned operands, inputs, clobbers, `#string` multi-instruction templates, `%[name]`/`%%` rewriting, and the §II.5 auto-naming rule all work and execute on the host JIT. Global `asm { … }` (Phase F) works AOT (call-into-asm -via lib-less `extern`). **Remaining feature gap:** `-> @place` write-through / -read-write / indirect-memory outputs (rejected at parse — Phase 2). Smaller +via lib-less `extern`). `-> @place` **write-through** outputs work (Phase 2); +read-write (`+`) and indirect-memory (`*`) place outputs are rejected loudly as +not-yet-implemented — the remaining feature work. Smaller follow-ups: the comptime-call guard for global asm (`#run` into a module-asm symbol should fail loud via dlsym-miss — pin a test), a JIT-vs-global-asm note (`sx run` silently mishandles module-asm symbols; AOT is correct), and the x86_64 @@ -172,17 +191,20 @@ Phase E–F feasibility already confirmed against the live tree `extern`, 60 sites; `--target` a global CLI flag). ## Next step -**Phase 2 — `-> @place` outputs** (the last feature gap): write-through -(`"=…" -> @place`), read-write (`"+…" -> @place`), and indirect-memory (`"=*m"`) -outputs, currently rejected at parse. Needs: parse `-> @` into an -`out_place` operand (payload = the place expr), lower the place to an address + -`store` the asm result through it (place outputs don't join the result tuple), -the `+` read-write seeding, and output-to-`const` rejection. See `PLAN-ASM.md` -Phase G / design §II.2 Dev 5 + cookbook (`cas`, `memcpy_bytes`, `cpuid_into`). +Inline assembly is **feature-complete for the common surface**. Remaining work, +all optional / additive (pick any): +- **Read-write (`"+…" -> @place`) place outputs**: LLVM expresses `+` as an + output `=` + a TIED input (`0` referencing the output index), with the seed + value passed as an arg — Zig's `llvm_rw_vals` mechanism. Currently rejected at + lowering. Needs the tied-input plumbing in `emitInlineAsm` + seeding a load of + the place. +- **Indirect-memory (`"=*m"`) outputs**: pass the place address as an arg, asm + writes through it (no return slot). Currently rejected. +- **Output-to-`const` rejection** for `-> @place` (the place must be mutable). +- **Polish**: comptime-call guard test for global asm; make `sx run` error (not + silently mishandle) a module-asm symbol; x86_64 syscall-write ir-only example. -Smaller polish (any order): comptime-call guard test for global asm; `sx run` -should error (not silently mishandle) a module-asm symbol; x86_64 syscall-write -ir-only example; `readme.md` inline-asm section. Orthogonal: **issue 0137**. +Orthogonal: **issue 0137** (no-`main` segfault). ## Log - (init) Plan + design doc written; ASM stream opened. @@ -227,6 +249,12 @@ ir-only example; `readme.md` inline-asm section. Orthogonal: **issue 0137**. volatile/operands); `Module.global_asm` captured in `lowerMainAndComptime`; `emit()` appends via `LLVMAppendModuleInlineAsm`; call-into via lib-less `extern`. AOT-verified (1648, `_my_add`→42). `zig build test` green (656 corpus). +- (docs) readme.md "Inline Assembly" section (b8800a2). +- (2) `-> @place` write-through — `out_place` operand; `out_ty` on the IR + AsmOperand; `emitInlineAsm` builds the combined output struct + splits + (out_place → store-through, out_value → result), fast-path when no places. + `+`/`*` rejected. Locked with 1649 (mixed, runs). `zig build test` green (657 + corpus, 446 unit). ## Known issues - **0137** — `sx run` on a program with no `main` segfaults (unguarded JIT entry diff --git a/examples/1649-platform-asm-place-output.sx b/examples/1649-platform-asm-place-output.sx new file mode 100644 index 0000000..f4746ff --- /dev/null +++ b/examples/1649-platform-asm-place-output.sx @@ -0,0 +1,19 @@ +// ASM stream Phase 2 — `-> @place` write-through output. An asm result can be +// STORED through a place (a local / struct field) instead of returned: the +// place output does NOT join the result tuple. Here one value output is +// returned (into `main_val`) while a second is written through `@other`. The +// two are combined to 42. Read-write (`+`) and indirect (`*`) place outputs are +// not yet implemented (rejected at lowering). aarch64-pinned; ir-only elsewhere. +compute :: () -> i64 { + other : i64 = 0; + main_val := asm volatile { + #string ASM + mov %[m], #5 + mov %[o], #37 +ASM, + [m] "=r" -> i64, // value output → returned + [o] "=r" -> @other, // place output → stored through @other + }; + return main_val + other; // 5 + 37 = 42 +} +main :: () -> i64 { return compute(); } diff --git a/examples/expected/1649-platform-asm-place-output.build b/examples/expected/1649-platform-asm-place-output.build new file mode 100644 index 0000000..42e24dd --- /dev/null +++ b/examples/expected/1649-platform-asm-place-output.build @@ -0,0 +1 @@ +{ "target": "macos" } diff --git a/examples/expected/1649-platform-asm-place-output.exit b/examples/expected/1649-platform-asm-place-output.exit new file mode 100644 index 0000000..d81cc07 --- /dev/null +++ b/examples/expected/1649-platform-asm-place-output.exit @@ -0,0 +1 @@ +42 diff --git a/examples/expected/1649-platform-asm-place-output.ir b/examples/expected/1649-platform-asm-place-output.ir new file mode 100644 index 0000000..4883443 --- /dev/null +++ b/examples/expected/1649-platform-asm-place-output.ir @@ -0,0 +1,25 @@ + +; Function Attrs: nounwind +define internal i64 @compute() #0 { +entry: + %alloca = alloca i64, align 8 + store i64 0, ptr %alloca, align 8 + %asm = call { i64, i64 } asm sideeffect " mov ${0}, #5\0A mov ${1}, #37\0A", "=r,=r"() + %asm.out = extractvalue { i64, i64 } %asm, 0 + %asm.out1 = extractvalue { i64, i64 } %asm, 1 + store i64 %asm.out1, ptr %alloca, align 8 + %allocaN = alloca i64, align 8 + store i64 %asm.out, ptr %allocaN, align 8 + %load = load i64, ptr %allocaN, align 8 + %loadN = load i64, ptr %alloca, align 8 + %add = add i64 %load, %loadN + ret i64 %add +} + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %call = call i64 @compute() + %ca.tr = trunc i64 %call to i32 + ret i32 %ca.tr +} diff --git a/examples/expected/1649-platform-asm-place-output.stderr b/examples/expected/1649-platform-asm-place-output.stderr new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/examples/expected/1649-platform-asm-place-output.stderr @@ -0,0 +1 @@ + diff --git a/examples/expected/1649-platform-asm-place-output.stdout b/examples/expected/1649-platform-asm-place-output.stdout new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/examples/expected/1649-platform-asm-place-output.stdout @@ -0,0 +1 @@ + diff --git a/src/backend/llvm/ops.zig b/src/backend/llvm/ops.zig index 757856d..440678b 100644 --- a/src/backend/llvm/ops.zig +++ b/src/backend/llvm/ops.zig @@ -789,8 +789,22 @@ pub const Ops = struct { if (op.role == .input) n_inputs += 1; } - // Result LLVM type: void (no value output) or the single scalar. - const ret_ty = if (instruction.ty == .void) e.cached_void else e.toLLVMType(instruction.ty); + // Combined LLVM return type: ALL outputs (out_value + out_place) in + // source order, each as its `out_ty`. out_place outputs come back in a + // return slot too — they get `store`d through their address below; only + // out_value outputs join the sx result. 0 → void, 1 → scalar, N → struct. + var out_llvm: std.ArrayList(c.LLVMTypeRef) = .empty; + defer out_llvm.deinit(alloc); + for (a.operands) |op| { + if (op.role == .input) continue; + out_llvm.append(alloc, e.toLLVMType(op.out_ty)) catch unreachable; + } + const n_out = out_llvm.items.len; + const ret_ty: c.LLVMTypeRef = switch (n_out) { + 0 => e.cached_void, + 1 => out_llvm.items[0], + else => c.LLVMStructTypeInContext(e.context, out_llvm.items.ptr, @intCast(n_out), 0), + }; // One LLVM call param per input operand, in source order. const param_types = alloc.alloc(c.LLVMTypeRef, n_inputs) catch unreachable; @@ -838,8 +852,48 @@ pub const Ops = struct { c.LLVMInlineAsmDialectATT, 0, // CanThrow ); - const label: [*:0]const u8 = if (instruction.ty == .void) "" else "asm"; - const result = c.LLVMBuildCall2(e.builder, fn_ty, asm_val, call_args.ptr, @intCast(n_inputs), label); + const label: [*:0]const u8 = if (n_out == 0) "" else "asm"; + const raw_result = c.LLVMBuildCall2(e.builder, fn_ty, asm_val, call_args.ptr, @intCast(n_inputs), label); + + // Fast path — no write-through outputs: every output is a value output, + // so the asm's return (void / scalar / `{T…}` struct) IS the sx result + // (the struct already matches sx's tuple representation). No split. + var has_place = false; + for (a.operands) |op| { + if (op.role == .out_place) has_place = true; + } + if (!has_place) { + e.mapRef(raw_result); + return; + } + + // ── Mixed/place outputs (source order): out_place → `store` the slot + // through its address; out_value → collect, then rebuild the sx result + // (0 → void/place-only call · 1 → that value · N → tuple `insertvalue`). ── + var value_vals: std.ArrayList(c.LLVMValueRef) = .empty; + defer value_vals.deinit(alloc); + var slot: c_uint = 0; + for (a.operands) |op| { + if (op.role == .input) continue; + const v = if (n_out == 1) raw_result else c.LLVMBuildExtractValue(e.builder, raw_result, slot, "asm.out"); + slot += 1; + if (op.role == .out_place) { + _ = c.LLVMBuildStore(e.builder, v, e.resolveRef(op.operand)); + } else { + value_vals.append(alloc, v) catch unreachable; + } + } + + const result: c.LLVMValueRef = blk: { + if (value_vals.items.len == 0) break :blk raw_result; + if (value_vals.items.len == 1) break :blk value_vals.items[0]; + const tuple_ty = e.toLLVMType(instruction.ty); + var agg = c.LLVMGetUndef(tuple_ty); + for (value_vals.items, 0..) |v, j| { + agg = c.LLVMBuildInsertValue(e.builder, agg, v, @intCast(j), "asm.tup"); + } + break :blk agg; + }; // Always mapRef — the IR Ref counter advances regardless of result type. e.mapRef(result); } diff --git a/src/ir/inst.zig b/src/ir/inst.zig index 28c8ab6..7fd6905 100644 --- a/src/ir/inst.zig +++ b/src/ir/inst.zig @@ -368,8 +368,15 @@ pub const InlineAsm = struct { name: StringId, /// Verbatim constraint, e.g. "={rax}", "=r", "+r", "{rdi}", "r". constraint: StringId, - /// `input` → the value `Ref`; `out_value` → `.none` (the asm yields it). + /// `input` → the value `Ref`; `out_value` → `.none` (the asm yields it); + /// `out_place` → the place ADDRESS `Ref` (a pointer; the asm result is + /// `store`d through it). operand: Ref, + /// The value type carried by an OUTPUT slot — `out_value`: its result + /// type; `out_place`: the pointee type stored through `operand`. `.void` + /// for inputs (their type comes from the input `Ref`). Lets emit build + /// the combined LLVM return struct without re-deriving from `Inst.ty`. + out_ty: TypeId = .void, pub const Role = enum { out_value, out_place, input }; }; diff --git a/src/ir/lower/expr.zig b/src/ir/lower/expr.zig index 852e397..03d12c4 100644 --- a/src/ir/lower/expr.zig +++ b/src/ir/lower/expr.zig @@ -2339,6 +2339,36 @@ pub fn lowerAsmExpr(self: *Lowering, ae: *const ast.AsmExpr, span: ast.Span) Ref // Effective name (design §II.5): explicit `[name]`, else auto-derived // from a `{reg}` pin, else anonymous (`.empty`). const eff_name: []const u8 = op.name orelse (pinnedRegister(op.constraint) orelse ""); + var operand_ref: Ref = Ref.none; + var out_ty: TypeId = .void; + switch (op.role) { + .input => operand_ref = self.lowerExpr(op.payload), + .out_value => out_ty = self.resolveTypeWithBindings(op.payload), + .out_place => { + // Read-write (`+`) and indirect-memory (`*`) place outputs aren't + // implemented yet — reject loudly rather than miscompile (§II.11). + if (op.constraint.len > 0 and op.constraint[0] == '+') { + diags.addFmt(.err, span, "read-write (`+`) asm outputs are not yet implemented; use a write-only `=` output", .{}); + return self.emitPlaceholder("inline_asm"); + } + if (std.mem.indexOfScalar(u8, op.constraint, '*') != null) { + diags.addFmt(.err, span, "indirect-memory (`*`) asm outputs are not yet implemented", .{}); + return self.emitPlaceholder("inline_asm"); + } + // `@place` lowers to its address (a pointer); the asm result is + // stored through it. The stored type is the pointee. + operand_ref = self.lowerExpr(op.payload); + const pty = self.inferExprType(op.payload); + out_ty = if (!pty.isBuiltin()) blk: { + const info = self.module.types.get(pty); + break :blk if (info == .pointer) info.pointer.pointee else .unresolved; + } else .unresolved; + if (out_ty == .unresolved) { + diags.addFmt(.err, span, "asm `-> @place` output target must be an addressable place", .{}); + return self.emitPlaceholder("inline_asm"); + } + }, + } ir_ops[i] = .{ .role = switch (op.role) { .out_value => .out_value, @@ -2347,8 +2377,8 @@ pub fn lowerAsmExpr(self: *Lowering, ae: *const ast.AsmExpr, span: ast.Span) Ref }, .name = if (eff_name.len == 0) types.StringId.empty else self.module.types.internString(eff_name), .constraint = self.module.types.internString(op.constraint), - // input → the lowered value Ref; an output yields its value (none). - .operand = if (op.role == .input) self.lowerExpr(op.payload) else Ref.none, + .operand = operand_ref, + .out_ty = out_ty, }; } diff --git a/src/parser.zig b/src/parser.zig index 8a8f6f4..fd77942 100644 --- a/src/parser.zig +++ b/src/parser.zig @@ -2780,10 +2780,17 @@ pub const Parser = struct { var payload: *Node = undefined; if (self.current.tag == .arrow) { self.advance(); - if (self.current.tag == .at) - return self.fail("`-> @place` write-through asm outputs are not supported yet (Phase 2); use a `-> Type` value output"); - role = .out_value; - payload = try self.parseTypeExpr(); + if (self.current.tag == .at) { + // `-> @place`: write-through output. `@place` is parsed as an + // ordinary address-of expression (a pointer); lowering stores + // the asm result through it. The output does NOT join the + // result tuple. + role = .out_place; + payload = try self.parseUnary(); + } else { + role = .out_value; + payload = try self.parseTypeExpr(); + } } else if (self.current.tag == .equal) { self.advance(); role = .input;