feat(asm): Phase E — multi-output asm returns tuples

Replaces the N>1 "Phase E" bail with a shared asmResultType helper (lowering + inferType) that derives the result type from the out_value operands: 0→void, 1→T, N→a named tuple (fields named via the §II.5 effective-name rule). Key realization: toLLVMType(tuple) already produces a literal struct {T1,…,Tn} — exactly what LLVM's multi-output inline asm returns — so emit needs NO change. Building the op with a tuple result type makes the asm call return the struct, which IS sx's tuple value (destructured by the normal tuple_get path). inferType's .asm_expr arm now also delegates to asmResultType (single owner), so `return asm`, `x := asm`, and `q, r := asm` all agree on the type. Verified end-to-end on aarch64: split(0x1234)→(lo=52,hi=18), a udiv/msub divmod→(3,2). IR: `call { i64, i64 } asm "divq ${4}", "={rax},={rdx},{rax},{rdx},r,~{cc}"(…)` → extractvalue → tuple. 1640 → the x86_64 multi-output IR lock (ir-only); 1647 → a multi-output example that runs on aarch64. zig build test green (655 corpus, 446 unit).
2026-06-15 21:55:38 +03:00
parent 5a5e04c6d5
commit d3c6ffed5a
15 changed files with 178 additions and 84 deletions
--- a/examples/1640-platform-asm-parse.sx
+++ b/examples/1640-platform-asm-parse.sx
@@ -1,10 +1,10 @@
-// ASM stream — `asm { … }` parses + validates the full rich shape: named value
-// outputs (`[quot] "={rax}" -> u64`), register-pinned inputs, and a
-// `clobbers(.…)` clause, all accepted. This is a MULTI-output (tuple-returning)
-// asm, which is deferred to Phase E — so lowering bails LOUD + named with the
-// specific "Phase E" diagnostic (single-output asm already runs; see 1645).
-// Called from `main` so lowering reaches the asm body (lazy lowering skips
-// uncalled functions).
+// ASM stream Phase E — x86_64 multi-output asm: `divq` produces quotient in rax
+// and remainder in rdx, returned as a `(quot, rem)` tuple. Two `={rax}`/`={rdx}`
+// value outputs ⇒ LLVM returns a `{ i64, i64 }` struct, which IS sx's tuple
+// representation (so `q, r := …` destructures it directly). x86-pinned via
+// `.build`: ir-only on a non-x86 host (the `.ir` snapshot locks the struct
+// return + `%[name]` rewrite); runs natively on x86_64-linux. See 1647 for a
+// multi-output example that executes on aarch64.
 divmod :: (n: u64, d: u64) -> (quot: u64, rem: u64) {
    return asm {
        "divq %[d]",
--- a/examples/1647-platform-asm-aarch64-multi.sx
+++ b/examples/1647-platform-asm-aarch64-multi.sx
@@ -0,0 +1,20 @@
+// ASM stream Phase E — multi-output asm that RUNS end-to-end on aarch64. Splits
+// a value into low/high bytes via two value outputs, returned + destructured as
+// a `(lo, hi)` tuple. The two outputs become an LLVM `{ i64, i64 }` struct =
+// sx's tuple. aarch64-pinned via `.build`: executes on a matching host (exit
+// reflects lo+hi), ir-only elsewhere.
+split :: (x: u64) -> (lo: u64, hi: u64) {
+    return asm {
+        #string ASM
+        and %[l], %[x], #0xff
+        lsr %[h], %[x], #8
+ASM,
+        [l] "=r" -> u64,
+        [h] "=r" -> u64,
+        [x] "r" = x,
+    };
+}
+main :: () -> i64 {
+    lo, hi := split(0x1234);
+    return xx (lo + hi);   // 0x34 + 0x12 = 52 + 18 = 70
+}
--- a/examples/expected/1640-platform-asm-parse.build
+++ b/examples/expected/1640-platform-asm-parse.build
@@ -0,0 +1 @@
+{ "target": "x86_64-linux" }
--- a/examples/expected/1640-platform-asm-parse.exit
+++ b/examples/expected/1640-platform-asm-parse.exit
@@ -1 +1 @@
-1
+0
--- a/examples/expected/1640-platform-asm-parse.ir
+++ b/examples/expected/1640-platform-asm-parse.ir
@@ -0,0 +1,26 @@
+
+; Function Attrs: nounwind
+define internal { i64, i64 } @divmod(i64 %0, i64 %1) #0 {
+entry:
+  %alloca = alloca i64, align 8
+  store i64 %0, ptr %alloca, align 8
+  %allocaN = alloca i64, align 8
+  store i64 %1, ptr %allocaN, align 8
+  %load = load i64, ptr %alloca, align 8
+  %loadN = load i64, ptr %allocaN, align 8
+  %asm = call { i64, i64 } asm "divq ${4}", "={rax},={rdx},{rax},{rdx},r,~{cc}"(i64 %load, i64 0, i64 %loadN)
+  ret { i64, i64 } %asm
+}
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+  %call = call { i64, i64 } @divmod(i64 17, i64 5)
+  %tg = extractvalue { i64, i64 } %call, 0
+  %alloca = alloca i64, align 8
+  store i64 %tg, ptr %alloca, align 8
+  %tgN = extractvalue { i64, i64 } %call, 1
+  %allocaN = alloca i64, align 8
+  store i64 %tgN, ptr %allocaN, align 8
+  ret i32 0
+}
--- a/examples/expected/1640-platform-asm-parse.stderr
+++ b/examples/expected/1640-platform-asm-parse.stderr
@@ -1,17 +1 @@
-error: multi-output (tuple-returning) inline assembly is not yet implemented (ASM stream Phase E)
-  --> examples/1640-platform-asm-parse.sx:9:12
-   |
- 9 |     return asm {
-   |            ^^^^^
-10 |         "divq %[d]",
-   | ^^^^^^^^^^^^^^^^^^^^
-11 |         [quot] "={rax}" -> u64,
-   | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-12 |         [rem]  "={rdx}" -> u64,
-   | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-13 |         "{rax}" = n,  "{rdx}" = 0,  [d] "r" = d,
-   | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-14 |         clobbers(.cc),
-   | ^^^^^^^^^^^^^^^^^^^^^^
-15 |     };
-   | ^^^^^
+
--- a/examples/expected/1647-platform-asm-aarch64-multi.build
+++ b/examples/expected/1647-platform-asm-aarch64-multi.build
@@ -0,0 +1 @@
+{ "target": "macos" }
--- a/examples/expected/1647-platform-asm-aarch64-multi.exit
+++ b/examples/expected/1647-platform-asm-aarch64-multi.exit
@@ -0,0 +1 @@
+70
--- a/examples/expected/1647-platform-asm-aarch64-multi.ir
+++ b/examples/expected/1647-platform-asm-aarch64-multi.ir
@@ -0,0 +1,31 @@
+
+; Function Attrs: nounwind
+define internal { i64, i64 } @split(i64 %0) #0 {
+entry:
+  %alloca = alloca i64, align 8
+  store i64 %0, ptr %alloca, align 8
+  %load = load i64, ptr %alloca, align 8
+  %asm = call { i64, i64 } asm "        and ${0}, ${2}, #0xff\0A        lsr ${1}, ${2}, #8\0A", "=r,=r,r"(i64 %load)
+  %tg = extractvalue { i64, i64 } %asm, 0
+  %tgN = extractvalue { i64, i64 } %asm, 1
+  %ti = insertvalue { i64, i64 } undef, i64 %tg, 0
+  %tiN = insertvalue { i64, i64 } %ti, i64 %tgN, 1
+  ret { i64, i64 } %tiN
+}
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+  %call = call { i64, i64 } @split(i64 4660)
+  %tg = extractvalue { i64, i64 } %call, 0
+  %alloca = alloca i64, align 8
+  store i64 %tg, ptr %alloca, align 8
+  %tgN = extractvalue { i64, i64 } %call, 1
+  %allocaN = alloca i64, align 8
+  store i64 %tgN, ptr %allocaN, align 8
+  %load = load i64, ptr %alloca, align 8
+  %loadN = load i64, ptr %allocaN, align 8
+  %add = add i64 %load, %loadN
+  %ca.tr = trunc i64 %add to i32
+  ret i32 %ca.tr
+}
--- a/examples/expected/1647-platform-asm-aarch64-multi.stderr
+++ b/examples/expected/1647-platform-asm-aarch64-multi.stderr
@@ -0,0 +1 @@
+
--- a/examples/expected/1647-platform-asm-aarch64-multi.stdout
+++ b/examples/expected/1647-platform-asm-aarch64-multi.stdout
@@ -0,0 +1 @@
+