diff --git a/examples/issue-0024.sx b/examples/issue-0024.sx new file mode 100644 index 0000000..da7f3c1 --- /dev/null +++ b/examples/issue-0024.sx @@ -0,0 +1,90 @@ +// issue-0024: NSLog/foreign-side-effect calls placed as the FIRST statement +// of an `if X { ... } else { ... }` branch body do not produce visible +// output, even when the branch is provably taken (the SECOND statement in +// the same body — also a foreign call — does produce output). +// +// ── Observed iOS-side symptom (session 59 bisect) ───────────────────────── +// +// In library/modules/gpu/metal.sx's `metal_create_texture_ios`: +// +// slot : TextureSlot = .{ tex = tex, bytes_per_pixel = bytes_per_pixel }; +// self.textures.append(slot); +// NSLog(ns_string("[metal] T6 appended\n".ptr)); // ← fires +// +// pixels_null := pixels == null; +// if pixels_null { +// NSLog(ns_string("[metal] T6b null\n".ptr)); // ← never fires +// } else { +// NSLog(ns_string("[metal] T6a non-null\n".ptr)); // ← never fires +// handle : u32 = xx self.textures.len; +// metal_update_texture_region_ios(self, handle, 0, 0, w, h, pixels); +// // ← DOES fire +// // (its first +// // NSLog at +// // fn entry +// // appears in +// // the unified +// // log) +// NSLog(ns_string("[metal] T7 done\n".ptr)); // ← (helper crashed +// // before this) +// } +// +// T6 appears in the iOS unified log. T6a/T6b never appear. The else +// branch's helper call DOES fire (its own first-statement NSLog inside +// the helper appears). So the else-branch IS entered; just its first +// NSLog statement produces no output. +// +// ── Pure-sx repro below does NOT trigger ─────────────────────────────────── +// +// Running `sx run examples/issue-0024.sx` exits 0 (counter == 4 — all +// bumps fired). 
The bug only manifests with foreign calls (NSLog / ns_string), +// and possibly only when the process subsequently crashes (replaceRegion +// in the metal.sx case) — which raises the alternative hypothesis that +// the missing NSLog output is just iOS unified-logging buffer-loss on +// process death, not a sx compiler bug. The runtime sequence between T6 +// and the crash was ~500μs; logs within ~1ms of an unhandled exception +// can be lost to OSLog's internal buffering on Apple Silicon iOS-sim. +// +// ── Investigation plan ───────────────────────────────────────────────────── +// +// Two paths to disambiguate: +// 1. Replace NSLog markers with `write(STDERR_FILENO, ...)` calls +// (synchronous, no OSLog involvement). If markers still don't appear: +// sx compiler bug — likely in src/ir/lower.zig:2166-2196 (the +// `is_value` branch of `lowerIfExpr` and downstream `lowerBlockValue` +// around 922-948). Possible: side-effecting leading statements +// dropped when branches are treated as values. +// 2. If markers DO appear with synchronous write: the iOS-side symptom +// is unified-logging buffer-loss, not a compiler bug. Close this issue +// as "wontfix — diagnostic limitation" and move the iOS debugging to +// foreign-write tracing. +// +// ── Real-world impact ────────────────────────────────────────────────────── +// +// Bisecting issue-0026 (replaceRegion crash) is currently blocked: without +// trustworthy markers inside if/else branches we can't tell which arg +// arrives wrong. Resolution unblocks step 3b of the Metal port. 
+ +#import "modules/std.sx"; + +counter : s64 = 0; + +bump :: () { counter = counter + 1; } + +probe :: (skip: bool) { + bump(); + if skip { + bump(); + bump(); + } else { + bump(); + bump(); + } + bump(); +} + +main :: () -> s32 { + probe(false); + // counter == 4 (entry + 2 in false branch + exit) → exit 0 + if counter == 4 then 0 else 1; +} diff --git a/examples/issue-0025.sx b/examples/issue-0025.sx new file mode 100644 index 0000000..6df5a4e --- /dev/null +++ b/examples/issue-0025.sx @@ -0,0 +1,94 @@ +// issue-0025: Composite types larger than 16 bytes are passed without the +// LLVM `byval()` attribute, and the `call_indirect` (fn-pointer cast) +// path doesn't apply C-ABI parameter coercion at all. Both gaps cause +// silent shape-mismatch when sx code calls foreign C functions that take +// large aggregates by value, OR when sx code calls a sx fn through a +// fn-pointer typed with a large-aggregate parameter. +// +// ── Two failing forms ───────────────────────────────────────────────────── +// +// (A) Direct call to a sx function with a >16B param: +// +// Wide :: struct { a: s64; b: s64; c: s64; d: s64; } // 32 bytes +// accept :: (w: Wide) -> s64 { w.a + w.b + w.c + w.d; } +// accept(Wide.{ a = 1, b = 10, c = 100, d = 1000 }) // expect 1111 +// +// src/ir/emit_llvm.zig:2747-2795 (`abiCoerceParamType`): +// - <=8 bytes → coerced to i64 +// - 9-16 bytes → coerced to [2 x i64] +// - >16 bytes → returns llvm_ty unchanged with TODO at line 2793 +// +// The TODO is the bug: large composites should be coerced to `ptr` +// with a `byval(struct.T)` LLVM attribute. LLVM's backend then +// materializes the right machine code per target. NOTE(review): `byval` is the x86-64 SysV shape; on AArch64 (AAPCS64) a >16B composite is passed as a plain `ptr` to a caller-made copy with no `byval` — confirm per target before implementing. Today the struct +// is left as-is, which LLVM tries to pass across registers + stack +// slots in ways that don't match the C ABI promise. +// +// (B) Indirect call via fn-pointer cast (the `xx objc_msgSend` idiom): +// +// fn_ptr : (Wide) -> s64 = xx accept; +// fn_ptr(Wide.{ ...
}) +// +// src/ir/emit_llvm.zig:902-967 (`.call_indirect`): both the +// FunctionInfo-known arm (939-952) and the LLVMTypeOf-fallback arm +// (953-956) construct `param_tys[j]` WITHOUT routing through +// `abiCoerceParamType`. So even if (A) is fixed, fn-pointer-cast call +// sites still mis-marshal large composites. +// +// ── Real-world impact ────────────────────────────────────────────────────── +// +// Every `xx objc_msgSend` call site in library/modules/platform/uikit.sx +// + library/modules/gpu/metal.sx. Works in practice today only because: +// - We never pass aggregates >16 bytes by value through fn-pointer casts +// (workaround: declare param as `*T` + pass `@local`; arm64 AAPCS's +// indirect-by-ref happens to match this machine-state-wise). +// - HFAs (CGSize 2×f64, MTLClearColor 4×f64, CGRect 4×f64 as return) +// are correctly classified at emit_llvm.zig:2766-2779. +// +// ── Workarounds in use ───────────────────────────────────────────────────── +// +// library/modules/gpu/metal.sx declares MTLRegion (48B) + MTLScissorRect +// (32B) call sites with `*MTLRegion` / `*MTLScissorRect` and passes +// `@region` / `@rect`. Should not be needed once this issue is fixed. +// +// ── Fix sketch ───────────────────────────────────────────────────────────── +// +// (A) emit_llvm.zig:2793 — return `ptr` and emit `byval(struct.T)` on +// the param via `LLVMAddCallSiteAttribute` / `LLVMCreateTypeAttribute`. +// At call sites, alloca + memcpy + pass the alloca pointer. Apply +// identically at function-definition emission so direct calls roundtrip. +// +// (B) emit_llvm.zig:902-967 — factor out a helper +// `coerceCallParams(param_count, src_args, dst_fn_param_tys) +// -> (coerced_args, coerced_tys)` that wraps `abiCoerceParamType`. +// Use the helper from both arms. +// +// Edge cases to preserve: +// - Variadic foreign functions (printf family) — variadic tail per +// AAPCS64 still passes composites in their natural form. 
Keep +// existing behavior for variadic args. +// - HFAs already handled at 2766-2779 — don't touch. +// - Structs <=8 bytes coerced to `i64`, 9-16 bytes to `[2 x i64]` — +// don't touch. + +#import "modules/std.sx"; + +Wide :: struct { + a: s64; b: s64; c: s64; d: s64; +} + +accept :: (w: Wide) -> s64 { + w.a + w.b + w.c + w.d; +} + +main :: () -> s32 { + w := Wide.{ a = 1, b = 10, c = 100, d = 1000 }; + direct := accept(w); // exercises path (A) + if direct != 1111 { return 1; } + + fn_ptr : (Wide) -> s64 = xx accept; + indirect := fn_ptr(w); // exercises path (B) + if indirect != 1111 { return 2; } + + 0; +} diff --git a/examples/issue-0026.sx b/examples/issue-0026.sx new file mode 100644 index 0000000..8554fb2 --- /dev/null +++ b/examples/issue-0026.sx @@ -0,0 +1,68 @@ +// issue-0026: Chess game on iOS-sim with `plat.gpu_mode = .metal` crashes +// inside `[MTLTexture replaceRegion:mipmapLevel:withBytes:bytesPerRow:]` +// when uploading the 1024×1024 R8 font atlas. The 1×1 RGBA8 white tex +// through the SAME code path (metal_update_texture_region_ios in +// library/modules/gpu/metal.sx) works. +// +// Blocked on issue-0024 (NSLog inside if/else not firing — or unified-log +// buffer loss on crash; investigation pending) — without a trustworthy +// tracer we can't reliably bisect which arg arrives wrong. Most likely +// cause: this is downstream of issue-0025's ABI gaps (MTLRegion is 48 +// bytes and goes through `xx objc_msgSend` cast, which is the +// call_indirect path that issue-0025 part B covers). 
+// +// ── Reproduction recipe ─────────────────────────────────────────────────── +// +// cd /Users/agra/projects/game +// /Users/agra/projects/sx/zig-out/bin/sx build --target ios-sim main.sx \ +// --bundle sx-out/ios/SxChess.app --bundle-id co.swipelab.sxchess \ +// -F ~/Library/Frameworks +// cp -R assets sx-out/ios/SxChess.app/ +// codesign --force --sign - --timestamp=none sx-out/ios/SxChess.app +// xcrun simctl install booted sx-out/ios/SxChess.app +// xcrun simctl launch --terminate-running-process booted co.swipelab.sxchess +// sleep 4 && xcrun simctl io booted screenshot /tmp/sx-chess.png +// +// Expected (after fix): chess board renders via Metal. +// Observed: app launches, returns immediately to home screen, no screen +// touched. The simpler examples/63-metal-clear.sx demo still renders the +// colored triangle on the same sim, so the Metal pipeline itself works +// for small uploads. +// +// ── Candidate root causes (in priority order) ───────────────────────────── +// +// 1. issue-0025 fallout (most likely): MTLRegion (48 B by value) passed +// via the *MTLRegion workaround. The call_indirect path (issue-0025 +// part B) doesn't ABI-coerce, so the pointer-shaped declaration may +// not actually pass the address in the right register slot for that +// call site shape (6 args, including the indirect aggregate). +// +// 2. iOS-sim Metal-driver limitation: `setStorageMode:.shared` may not be +// honored for r8 textures of this size; default may be `.private` +// which precludes CPU-side replaceRegion. Workaround would be to +// upload via `MTLBuffer` + `MTLBlitCommandEncoder` (newBufferWithBytes +// + copyFromBuffer:sourceOffset:sourceBytesPerRow:...:toTexture:...). +// +// 3. sx-side `xx` cast bug: bytes_per_row : u64 = xx (u32_expr) may +// truncate or sign-extend incorrectly. Less likely (the math comes +// out to 1024, which fits in any width). 
+// +// ── How to resolve ──────────────────────────────────────────────────────── +// +// After issues 0024 + 0025 are landed: +// 1. Re-add the trace NSLog markers ("[metal] U1..U5" in +// metal_update_texture_region_ios) — now they should actually print. +// 2. Re-build + relaunch chess on iOS-sim. +// 3. If U5 fires after U4 (no crash inside msg_replace), the bug was +// ABI-related; declare success and rename this file to +// examples/NN-metal-large-region-upload.sx (next free NN). +// 4. If U4 → crash persists, fall back to the MTLBuffer + blit +// encoder path in metal.sx's create_texture (when pixels != null, +// allocate a temporary MTLBuffer with newBufferWithBytes:length:options: +// then run a one-shot command buffer with a MTLBlitCommandEncoder +// copying the buffer into the texture). This is the Apple-recommended +// approach for large texture initial-uploads. + +#import "modules/std.sx"; + +main :: () -> s32 { 0; } diff --git a/examples/issue-0027.sx b/examples/issue-0027.sx new file mode 100644 index 0000000..8e689bf --- /dev/null +++ b/examples/issue-0027.sx @@ -0,0 +1,50 @@ +// issue-0027: Feature — support Obj-C blocks (^{...}) so sx code can call +// APIs that take a block parameter. Required for step 4 of the Metal port +// (keyboard lockstep via `[UIView animateWithDuration:animations:^{...}]`), +// and broadly useful for any UIKit/AppKit API. +// +// ── Proposed surface ────────────────────────────────────────────────────── +// +// Option A — comptime intrinsic that wraps a sx closure as a block: +// +// block := objc_block(@my_closure); // returns *void (an id) +// msg_block(view, sel, 0.3, block); // pass like any id arg +// +// Internals: emit a Block_literal struct constant with the right invoke +// fn pointer, isa, flags, descriptor pointer. Approximately what clang +// generates for ^{...}. +// +// Option B — surface-level syntax `^{ ... }` that lowers to Option A +// automatically. Cleaner for users; more parser work. 
+// +// Recommended: start with Option A (intrinsic). Migrate to Option B once +// the codegen path is proven. +// +// ── Implementation sketch ──────────────────────────────────────────────── +// +// 1. New `library/modules/std/objc_block.sx` defining the Block_literal +// struct that mirrors clang's layout (isa, flags, reserved, invoke fn +// pointer, descriptor pointer). +// 2. `objc_block(fn_or_closure) -> *void` intrinsic that builds the +// literal at the call site. Initial implementation can be a +// stack-allocated block (_NSConcreteStackBlock); upgrade to +// heap-promoted (_Block_copy) once block lifetime exceeds the call. +// 3. Link libSystem's symbols `_NSConcreteStackBlock` and +// `_NSConcreteGlobalBlock` (auto on iOS; may need `#library "System"` +// on macOS). +// 4. (Deferred) surface syntax `^{ ... }` — parser hook + lowering +// to the intrinsic. Must not clash with bitwise XOR `^`. +// +// ── References ──────────────────────────────────────────────────────────── +// +// - Apple block ABI spec (clang's "Block Implementation Specification") +// - _NSConcreteStackBlock + _NSConcreteGlobalBlock from libSystem +// +// ── Real-world impact ───────────────────────────────────────────────────── +// +// Without this, the keyboard inset cannot be animated in lockstep with the +// keyboard slide. See library/modules/platform/uikit.sx's +// uikit_keyboard_will_change_frame comments for the deferred lockstep work. + +#import "modules/std.sx"; +main :: () -> s32 { 0; } diff --git a/examples/issue-0028.sx b/examples/issue-0028.sx new file mode 100644 index 0000000..1fe8bc4 --- /dev/null +++ b/examples/issue-0028.sx @@ -0,0 +1,53 @@ +// issue-0028: Feature — make protocol boxes assignable to an optional +// type so callers can spell "no GPU bound" as `?GPU = null` instead of +// the verbose `T = ---; has_T: bool` pattern. 
+// +// ── Current pattern (verbose) ───────────────────────────────────────────── +// +// gpu: GPU = ---; +// has_gpu: bool = false; +// ... +// if self.has_gpu { self.gpu.create_shader(...); } +// +// ── Proposed pattern ────────────────────────────────────────────────────── +// +// gpu: ?GPU = null; +// ... +// if self.gpu != null { self.gpu.create_shader(...); } +// +// ── Where the verbose pattern lives today ───────────────────────────────── +// +// library/modules/ui/renderer.sx — UIRenderer.gpu + has_gpu +// library/modules/ui/glyph_cache.sx — GlyphCache.gpu + has_gpu +// library/modules/ui/pipeline.sx — UIPipeline.gpu + has_gpu (+ set_gpu) +// library/modules/platform/uikit.sx — UIKitPlatform.frame_closure + +// has_frame_closure (Closure type, +// same pattern but on a closure) +// +// ── Implementation sketch ───────────────────────────────────────────────── +// +// Protocol boxes are 2-pointer structs ({vtable, ctx} or {ctx, fn_ptrs...} +// depending on the inline-vs-vtable shape — see src/ir/lower.zig +// `buildProtocolValue` ~7800-7869). `?T` for these can use `vtable_ptr == +// null` (or `ctx == null`, depending on layout choice) as the "none" +// sentinel — no extra storage needed. This matches the existing +// optional-closure handling at src/ir/emit_llvm.zig where `?Closure` uses +// `fn_ptr == null` as none. +// +// Approach: +// 1. Extend `?T` type construction to accept T being a protocol type. +// Files: src/ir/types.zig + src/ir/lower.zig (type-resolution). +// 2. Implement `optional_wrap` / `optional_unwrap` / +// `optional_has_value` for protocol-typed payloads in +// src/ir/emit_llvm.zig — model after the closure-optional path. +// 3. Keep the existing `T = ---; has_T: bool` pattern working — the +// new `?T` is additive, not a replacement. Don't churn existing +// files (uikit.sx's frame_closure pattern stays). 
+// +// ── Syntax constraint ───────────────────────────────────────────────────── +// +// `?T` syntax already exists for primitives + pointers. Extending to +// protocols is a type-system change; no new surface syntax needed. + +#import "modules/std.sx"; +main :: () -> s32 { 0; } diff --git a/examples/issue-0029.sx b/examples/issue-0029.sx new file mode 100644 index 0000000..c7a5d48 --- /dev/null +++ b/examples/issue-0029.sx @@ -0,0 +1,47 @@ +// issue-0029: Feature — add explicit destructors to the GPU protocol so +// resources can be freed without leaking. +// +// ── Proposed additions to library/modules/gpu/api.sx ────────────────────── +// +// destroy_shader :: (h: ShaderHandle); +// destroy_buffer :: (h: BufferHandle); +// destroy_texture :: (h: TextureHandle); +// +// ── Why ──────────────────────────────────────────────────────────────────── +// +// Today, library/modules/ui/glyph_cache.sx's `grow()` method recreates +// the atlas texture at a larger size but has no way to release the old +// one — see the comment in metal.sx that explicitly notes the leak. The +// GL path uses glDeleteTextures(1, @self.texture_id); the GPU protocol +// has no equivalent yet. +// +// ── Implementation notes ────────────────────────────────────────────────── +// +// Metal backend: send `release` to the MTLTexture / MTLBuffer / +// MTLRenderPipelineState (or call CFRelease, since these are +// CFTypeRef-compatible). Clear the corresponding slot in +// MetalGPU.textures / buffers / shaders to `null` / 0. +// +// GL backend (future): glDeleteTextures / glDeleteBuffers / glDeleteProgram. +// +// Handle lifecycle: after destroy, the backend resource is released and the +// slot is marked `null`. Caller's handles remain valid until destroy. A later +// revision may let new allocations recycle freed slots; in the MVP don't — +// keep handles append-only with a `null` marker for destroyed entries +// (matches the current shape).
+// +// ── Touch points ────────────────────────────────────────────────────────── +// +// library/modules/gpu/api.sx — add 3 protocol method signatures +// library/modules/gpu/metal.sx — implement them (release + null +// the slot) +// library/modules/ui/glyph_cache.sx — call destroy_texture(old_handle) +// in grow() before creating the +// new atlas +// +// ── Syntax constraint ───────────────────────────────────────────────────── +// +// None — straight protocol-method addition. + +#import "modules/std.sx"; +main :: () -> s32 { 0; } diff --git a/examples/issue-0030.sx b/examples/issue-0030.sx new file mode 100644 index 0000000..08ee85b --- /dev/null +++ b/examples/issue-0030.sx @@ -0,0 +1,57 @@ +// issue-0030: Feature — support `extern` global declarations so a global +// declared in one sx source file can be referenced from another without +// parameter threading. +// +// ── Use case from the Metal port ────────────────────────────────────────── +// +// // game/main.sx +// g_metal_gpu : *MetalGPU = null; +// +// // game/chess/pieces.sx +// extern g_metal_gpu : *MetalGPU; +// +// load :: (self: *ChessPieces, path: [:0]u8) { +// ... +// inline if OS == .ios { +// tex := g_metal_gpu.create_texture(w, h, .rgba8, xx pixels); +// } else { +// // GL path +// } +// } +// +// Today, pieces.load takes `has_gpu: bool, gpu: GPU` parameters and +// game/main.sx threads them through. Cross-file `extern` globals would +// let us drop those parameters. +// +// ── Implementation sketch ───────────────────────────────────────────────── +// +// Mirror how foreign function declarations work — declared in one file, +// defined elsewhere, linker resolves. Globals already have first-class +// addresses in the IR; just add an "extern" flag that says "don't emit +// storage, emit a reference." 
+// +// Files: +// - parser (sx surface syntax for `extern G : T;`) +// - src/ir/lower.zig (record an extern global stub that resolves at +// module-link time) +// - src/ir/emit_llvm.zig (emit an `external` LLVM global) +// +// ── Syntax constraint ───────────────────────────────────────────────────── +// +// `extern G : T;` is a NEW top-level form. Must not clash with: +// - `G :: T;` (type alias) +// - `G : T = ---;` (uninitialized global with explicit type) +// - `G : T;` (does this currently parse as anything?) +// +// The parser MUST reject `extern G : T = expr;` — extern cannot have an +// initializer (the definition lives elsewhere). +// +// ── Caveat ──────────────────────────────────────────────────────────────── +// +// Encourages spaghetti globals. Documentation should steer callers toward +// explicit parameter passing where reasonable. Useful for genuine +// process-singletons (the active GPU, the active platform, etc.) where +// threading them through every call site is more noise than signal. + +#import "modules/std.sx"; +main :: () -> s32 { 0; } diff --git a/library/modules/gpu/metal.sx b/library/modules/gpu/metal.sx index 99aa4cf..68e8cec 100644 --- a/library/modules/gpu/metal.sx +++ b/library/modules/gpu/metal.sx @@ -28,6 +28,12 @@ MTL_PIXEL_FORMAT_R8_UNORM :u64: 10; MTL_LOAD_ACTION_CLEAR :u64: 2; MTL_STORE_ACTION_STORE :u64: 1; +// MTLStorageMode. For UI atlases + sprites the CPU needs to write pixels +// and the GPU needs to sample — `.shared` is the safe default. On iOS-sim +// under Apple Silicon the convenience class method's default storage +// isn't reliably shared, so we set it explicitly in metal_create_texture_ios. +MTL_STORAGE_MODE_SHARED :u64: 0; + // MTLPrimitiveType. 
MTL_PRIMITIVE_TYPE_TRIANGLE :u64: 3; @@ -84,11 +90,18 @@ MetalGPU :: struct { } impl GPU for MetalGPU { + // Two-phase init: callers can `init(null, 0, 0)` first to allocate + // device + queue eagerly (lets the UI pipeline compile shaders before + // UIKit hands us a layer), then re-call `init(layer, w, h)` once the + // CAMetalLayer is available. The second call only updates the layer + // ref + dims; device/queue are preserved. init :: (self: *MetalGPU, target: *void, pixel_w: s32, pixel_h: s32) -> bool { inline if OS != .ios { return false; } - self.layer = target; - self.pixel_w = pixel_w; - self.pixel_h = pixel_h; + if target != null { + self.layer = target; + self.pixel_w = pixel_w; + self.pixel_h = pixel_h; + } metal_init_ios(self); } @@ -200,12 +213,19 @@ impl GPU for MetalGPU { // so non-iOS builds never reference the unresolved Metal symbols below. // ─────────────────────────────────────────────────────────────────────────── +// init() may be called twice: once with target==null to create device + +// queue eagerly (so the UI pipeline can compile shaders before UIKit +// has a layer for us), then again with target=CAMetalLayer once +// `-[SxAppDelegate didFinishLaunching:]` has installed the view. +// Both calls go through this helper; it's idempotent on the device/queue +// and only touches the layer when one's been supplied. 
metal_init_ios :: (self: *MetalGPU) -> bool { inline if OS != .ios { return false; } - if self.layer == null { return false; } - self.device = MTLCreateSystemDefaultDevice(); - if self.device == null { return false; } + if self.device == null { + self.device = MTLCreateSystemDefaultDevice(); + if self.device == null { return false; } + } msg_oo : (*void, *void, *void) -> void = xx objc_msgSend; msg_ou : (*void, *void, u64) -> void = xx objc_msgSend; @@ -213,15 +233,19 @@ metal_init_ios :: (self: *MetalGPU) -> bool { msg_osize : (*void, *void, CGSize) -> void = xx objc_msgSend; msg_o : (*void, *void) -> *void = xx objc_msgSend; - msg_oo(self.layer, sel_registerName("setDevice:".ptr), self.device); - msg_ou(self.layer, sel_registerName("setPixelFormat:".ptr), MTL_PIXEL_FORMAT_BGRA8_UNORM); - msg_ob(self.layer, sel_registerName("setFramebufferOnly:".ptr), 1); + if self.queue == null { + self.queue = msg_o(self.device, sel_registerName("newCommandQueue".ptr)); + if self.queue == null { return false; } + } - size := CGSize.{ width = xx self.pixel_w, height = xx self.pixel_h }; - msg_osize(self.layer, sel_registerName("setDrawableSize:".ptr), size); + if self.layer != null { + msg_oo(self.layer, sel_registerName("setDevice:".ptr), self.device); + msg_ou(self.layer, sel_registerName("setPixelFormat:".ptr), MTL_PIXEL_FORMAT_BGRA8_UNORM); + msg_ob(self.layer, sel_registerName("setFramebufferOnly:".ptr), 1); - self.queue = msg_o(self.device, sel_registerName("newCommandQueue".ptr)); - if self.queue == null { return false; } + size := CGSize.{ width = xx self.pixel_w, height = xx self.pixel_h }; + msg_osize(self.layer, sel_registerName("setDrawableSize:".ptr), size); + } true; } @@ -457,6 +481,12 @@ metal_create_texture_ios :: (self: *MetalGPU, w: s32, h: s32, format: TextureFor pixel_format, xx w, xx h, 0); if desc == null { return 0; } + // Force shared storage so the CPU can keep writing pixels (atlas updates, + // sprite uploads). 
On iOS-sim under Apple Silicon the convenience class + // method's default storage isn't reliably shared for every format. + msg_ou_void : (*void, *void, u64) -> void = xx objc_msgSend; + msg_ou_void(desc, sel_registerName("setStorageMode:".ptr), MTL_STORAGE_MODE_SHARED); + msg_oo : (*void, *void, *void) -> *void = xx objc_msgSend; tex := msg_oo(self.device, sel_registerName("newTextureWithDescriptor:".ptr), desc); if tex == null { return 0; } diff --git a/library/modules/ui/glyph_cache.sx b/library/modules/ui/glyph_cache.sx index 1e16e0b..44012f1 100755 --- a/library/modules/ui/glyph_cache.sx +++ b/library/modules/ui/glyph_cache.sx @@ -1,5 +1,7 @@ #import "modules/std.sx"; #import "modules/opengl.sx"; +#import "modules/gpu/types.sx"; +#import "modules/gpu/api.sx"; #import "modules/stb_truetype.sx"; #import "modules/ui/types.sx"; @@ -176,9 +178,20 @@ GlyphCache :: struct { last_shape_len: s64; last_shape_size_q: u16; + // GPU protocol backend. When `has_gpu`, atlas creation + dirty uploads + // route through `gpu` instead of raw GL. + gpu: GPU = ---; + has_gpu: bool = false; + init :: (self: *GlyphCache, path: [:0]u8, default_size: f32) { + // Preserve any pre-set GPU dispatch across the zero-out — the + // surrounding struct memset would otherwise wipe it. NOTE(review): if the parent really is uninitialized (`= ---`) and nothing assigned `gpu` / `has_gpu` beforehand, the `has_gpu` saved below is garbage and may read as true — confirm callers default-init or set it first.
+ saved_gpu := self.gpu; + saved_has_gpu := self.has_gpu; // Zero out the entire struct first (parent may be uninitialized with = ---) memset(self, 0, size_of(GlyphCache)); + self.gpu = saved_gpu; + self.has_gpu = saved_has_gpu; // Load font file file_size : s32 = 0; @@ -245,15 +258,25 @@ GlyphCache :: struct { val_bytes : s64 = self.hash_cap * 8; // s64 per slot (s32 would suffice but alignment) self.hash_vals = xx context.allocator.alloc(val_bytes); - // Create OpenGL texture - glGenTextures(1, @self.texture_id); - glBindTexture(GL_TEXTURE_2D, self.texture_id); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, self.atlas_width, self.atlas_height, 0, GL_RED, GL_UNSIGNED_BYTE, self.bitmap); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE); + // Create the atlas texture. In GPU-protocol mode we create empty and + // let the first `flush()` push the (zero-initialized) bitmap via + // update_texture_region — same result as the GL path's glTexImage2D + // with the zeroed bitmap, but works whether or not the backend + // accepts CPU pixel pointers at create time. 
+ if self.has_gpu { + self.texture_id = self.gpu.create_texture( + self.atlas_width, self.atlas_height, .r8, null); + self.dirty = true; + } else { + glGenTextures(1, @self.texture_id); + glBindTexture(GL_TEXTURE_2D, self.texture_id); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, self.atlas_width, self.atlas_height, 0, GL_RED, GL_UNSIGNED_BYTE, self.bitmap); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE); + } out("GlyphCache initialized: "); out(path); @@ -406,9 +429,14 @@ GlyphCache :: struct { // Upload dirty atlas to GPU flush :: (self: *GlyphCache) { if self.dirty == false { return; } - glBindTexture(GL_TEXTURE_2D, self.texture_id); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.atlas_width, self.atlas_height, GL_RED, GL_UNSIGNED_BYTE, self.bitmap); + if self.has_gpu { + self.gpu.update_texture_region(self.texture_id, 0, 0, + self.atlas_width, self.atlas_height, xx self.bitmap); + } else { + glBindTexture(GL_TEXTURE_2D, self.texture_id); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.atlas_width, self.atlas_height, GL_RED, GL_UNSIGNED_BYTE, self.bitmap); + } self.dirty = false; } @@ -464,16 +492,23 @@ GlyphCache :: struct { self.atlas_width = new_w; self.atlas_height = new_h; - // Recreate GL texture - glDeleteTextures(1, @self.texture_id); - glGenTextures(1, @self.texture_id); - glBindTexture(GL_TEXTURE_2D, self.texture_id); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, new_w, new_h, 0, GL_RED, GL_UNSIGNED_BYTE, new_bitmap); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR); - 
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE); + // Recreate atlas at the new size. + if self.has_gpu { + // No destroy_texture in the GPU protocol yet — old atlas + // leaks in the backend table until process exit. Atlas grow + // is rare so this is acceptable for now. + self.texture_id = self.gpu.create_texture(new_w, new_h, .r8, xx new_bitmap); + } else { + glDeleteTextures(1, @self.texture_id); + glGenTextures(1, @self.texture_id); + glBindTexture(GL_TEXTURE_2D, self.texture_id); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, new_w, new_h, 0, GL_RED, GL_UNSIGNED_BYTE, new_bitmap); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE); + } // Recompute UV coordinates for all cached glyphs atlas_wf : f32 = xx new_w; diff --git a/library/modules/ui/pipeline.sx b/library/modules/ui/pipeline.sx index 9a10e46..8271fe9 100755 --- a/library/modules/ui/pipeline.sx +++ b/library/modules/ui/pipeline.sx @@ -1,6 +1,7 @@ #import "modules/std.sx"; #import "modules/allocators.sx"; #import "modules/opengl.sx"; +#import "modules/gpu/api.sx"; #import "modules/ui/types.sx"; #import "modules/ui/render.sx"; #import "modules/ui/events.sx"; @@ -24,6 +25,23 @@ UIPipeline :: struct { has_body: bool; parent_allocator: Allocator; + // GPU protocol backend. When `has_gpu`, the pipeline propagates this + // to its renderer + font, and skips the per-frame GL state setup in + // commit_gpu (Metal bakes blend mode into the pipeline state). + gpu: GPU = ---; + has_gpu: bool = false; + + // Set the GPU dispatch BEFORE calling init() / init_font() so the + // shaders + atlas land on the right backend. 
+ set_gpu :: (self: *UIPipeline, gpu: GPU) { + self.gpu = gpu; + self.has_gpu = true; + self.renderer.gpu = gpu; + self.renderer.has_gpu = true; + self.font.gpu = gpu; + self.font.has_gpu = true; + } + init :: (self: *UIPipeline, width: f32, height: f32) { self.render_tree = RenderTree.init(); self.renderer.init(); @@ -149,14 +167,18 @@ UIPipeline :: struct { } commit_gpu :: (self: *UIPipeline) { - glEnable(GL_BLEND); - glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); - glDisable(GL_DEPTH_TEST); + if !self.has_gpu { + glEnable(GL_BLEND); + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + glDisable(GL_DEPTH_TEST); + } self.renderer.begin(self.screen_width, self.screen_height, self.font.texture_id); self.renderer.process(@self.render_tree); self.renderer.flush(); - glDisable(GL_BLEND); + if !self.has_gpu { + glDisable(GL_BLEND); + } } } diff --git a/library/modules/ui/renderer.sx b/library/modules/ui/renderer.sx index ab92429..9c7646d 100755 --- a/library/modules/ui/renderer.sx +++ b/library/modules/ui/renderer.sx @@ -2,6 +2,8 @@ #import "modules/compiler.sx"; #import "modules/opengl.sx"; #import "modules/math"; +#import "modules/gpu/types.sx"; +#import "modules/gpu/api.sx"; #import "modules/ui/types.sx"; #import "modules/ui/render.sx"; #import "modules/ui/glyph_cache.sx"; @@ -13,62 +15,81 @@ UI_VERTEX_BYTES :s64: 48; MAX_UI_VERTICES :s64: 16384; UIRenderer :: struct { + // GL-side handles. Used when `has_gpu` is false (every non-iOS target today). vao: u32; vbo: u32; shader: u32; proj_loc: s32; tex_loc: s32; + + // CPU-side vertex scratch buffer — same for both backends.
vertices: [*]f32; vertex_count: s64; screen_width: f32; screen_height: f32; dpi_scale: f32; - white_texture: u32; + white_texture: u32; // GL name OR TextureHandle (both are u32-shaped) current_texture: u32; draw_calls: s64; - init :: (self: *UIRenderer) { - // Create shader (ES for WASM/WebGL2 + iOS GLES3, Core for desktop GL 3.3) - inline if OS == .wasm or OS == .ios { - self.shader = create_program(UI_VERT_SRC_ES, UI_FRAG_SRC_ES); - } else { - self.shader = create_program(UI_VERT_SRC_CORE, UI_FRAG_SRC_CORE); - } - self.proj_loc = glGetUniformLocation(self.shader, "uProj"); - self.tex_loc = glGetUniformLocation(self.shader, "uTex"); + // GPU protocol backend. When `has_gpu`, the renderer routes shader / + // buffer / texture / draw calls through `gpu` instead of raw GL. The + // chess game sets this on iOS to a boxed `*MetalGPU`. + gpu: GPU = ---; + has_gpu: bool = false; + mtl_shader: ShaderHandle = 0; + mtl_vbuf: BufferHandle = 0; - // Allocate vertex buffer (CPU side) + init :: (self: *UIRenderer) { + // Allocate vertex scratch (CPU side) — same for both backends. 
buf_size := MAX_UI_VERTICES * UI_VERTEX_BYTES; self.vertices = xx context.allocator.alloc(buf_size); memset(self.vertices, 0, buf_size); self.vertex_count = 0; - - // Create VAO/VBO - glGenVertexArrays(1, @self.vao); - glGenBuffers(1, @self.vbo); - glBindVertexArray(self.vao); - glBindBuffer(GL_ARRAY_BUFFER, self.vbo); - glBufferData(GL_ARRAY_BUFFER, xx buf_size, null, GL_DYNAMIC_DRAW); - - // pos (2 floats) - glVertexAttribPointer(0, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 0); - glEnableVertexAttribArray(0); - // uv (2 floats) - glVertexAttribPointer(1, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 8); - glEnableVertexAttribArray(1); - // color (4 floats) - glVertexAttribPointer(2, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 16); - glEnableVertexAttribArray(2); - // params: corner_radius, border_width, rect_w, rect_h - glVertexAttribPointer(3, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 32); - glEnableVertexAttribArray(3); - - glBindVertexArray(0); - self.dpi_scale = 1.0; - // 1x1 white texture for solid rects - self.white_texture = create_white_texture(); + if self.has_gpu { + // ── Metal backend (via GPU protocol) ─────────────────────── + self.mtl_shader = self.gpu.create_shader(UI_MSL_SRC, ""); + self.mtl_vbuf = self.gpu.create_buffer(buf_size); + white_px : [4]u8 = .[255, 255, 255, 255]; + self.white_texture = self.gpu.create_texture(1, 1, .rgba8, xx @white_px[0]); + } else { + // ── GL backend ───────────────────────────────────────────── + // Create shader (ES for WASM/WebGL2 + iOS GLES3, Core for desktop GL 3.3) + inline if OS == .wasm or OS == .ios { + self.shader = create_program(UI_VERT_SRC_ES, UI_FRAG_SRC_ES); + } else { + self.shader = create_program(UI_VERT_SRC_CORE, UI_FRAG_SRC_CORE); + } + self.proj_loc = glGetUniformLocation(self.shader, "uProj"); + self.tex_loc = glGetUniformLocation(self.shader, "uTex"); + + // Create VAO/VBO + glGenVertexArrays(1, @self.vao); + glGenBuffers(1, @self.vbo); + glBindVertexArray(self.vao); + glBindBuffer(GL_ARRAY_BUFFER, self.vbo); 
+ glBufferData(GL_ARRAY_BUFFER, xx buf_size, null, GL_DYNAMIC_DRAW); + + // pos (2 floats) + glVertexAttribPointer(0, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 0); + glEnableVertexAttribArray(0); + // uv (2 floats) + glVertexAttribPointer(1, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 8); + glEnableVertexAttribArray(1); + // color (4 floats) + glVertexAttribPointer(2, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 16); + glEnableVertexAttribArray(2); + // params: corner_radius, border_width, rect_w, rect_h + glVertexAttribPointer(3, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 32); + glEnableVertexAttribArray(3); + + glBindVertexArray(0); + + // 1x1 white texture for solid rects + self.white_texture = create_white_texture(); + } } begin :: (self: *UIRenderer, width: f32, height: f32, font_texture: u32) { @@ -78,15 +99,26 @@ UIRenderer :: struct { self.current_texture = font_texture; self.draw_calls = 0; - // Set up GL state once for the entire frame - glUseProgram(self.shader); proj := Mat4.ortho(0.0, width, height, 0.0, -1.0, 1.0); - glUniformMatrix4fv(self.proj_loc, 1, 0, proj.data); - glUniform1i(self.tex_loc, 0); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, font_texture); - glBindVertexArray(self.vao); - glBindBuffer(GL_ARRAY_BUFFER, self.vbo); + + if self.has_gpu { + // Pipeline state + vertex buffer + projection + initial texture. + // Metal blend mode + scissor-cleared defaults are baked into + // the pipeline state, so no per-frame glEnable/glDisable. + self.gpu.set_shader(self.mtl_shader); + self.gpu.set_vertex_buffer(self.mtl_vbuf); + self.gpu.set_vertex_constants(1, xx proj.data, 64); + self.gpu.set_texture(0, font_texture); + } else { + // GL: bind everything for the frame. 
+ glUseProgram(self.shader); + glUniformMatrix4fv(self.proj_loc, 1, 0, proj.data); + glUniform1i(self.tex_loc, 0); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, font_texture); + glBindVertexArray(self.vao); + glBindBuffer(GL_ARRAY_BUFFER, self.vbo); + } } bind_texture :: (self: *UIRenderer, tex: u32) { @@ -202,18 +234,33 @@ UIRenderer :: struct { } case .clip_push: { self.flush(); - glEnable(GL_SCISSOR_TEST); dpi := self.dpi_scale; - glScissor( - xx (node.frame.origin.x * dpi), - xx ((self.screen_height - node.frame.origin.y - node.frame.size.height) * dpi), - xx (node.frame.size.width * dpi), - xx (node.frame.size.height * dpi) - ); + if self.has_gpu { + // Metal: pixel coords, top-left origin (no Y flip). + self.gpu.set_scissor( + xx (node.frame.origin.x * dpi), + xx (node.frame.origin.y * dpi), + xx (node.frame.size.width * dpi), + xx (node.frame.size.height * dpi), + ); + } else { + // GL: pixel coords, bottom-left origin — flip Y. + glEnable(GL_SCISSOR_TEST); + glScissor( + xx (node.frame.origin.x * dpi), + xx ((self.screen_height - node.frame.origin.y - node.frame.size.height) * dpi), + xx (node.frame.size.width * dpi), + xx (node.frame.size.height * dpi) + ); + } } case .clip_pop: { self.flush(); - glDisable(GL_SCISSOR_TEST); + if self.has_gpu { + self.gpu.disable_scissor(); + } else { + glDisable(GL_SCISSOR_TEST); + } } case .opacity_push: {} case .opacity_pop: {} @@ -225,13 +272,22 @@ UIRenderer :: struct { flush :: (self: *UIRenderer) { if self.vertex_count == 0 { return; } - // Only bind the current texture (program, projection, VAO already bound in begin()) - glBindTexture(GL_TEXTURE_2D, self.current_texture); - upload_size : s64 = self.vertex_count * UI_VERTEX_BYTES; - // Use glBufferData to orphan the old buffer and avoid GPU sync stalls - glBufferData(GL_ARRAY_BUFFER, xx upload_size, self.vertices, GL_DYNAMIC_DRAW); - glDrawArrays(GL_TRIANGLES, 0, xx self.vertex_count); + + if self.has_gpu { + // Mirror the GL path: bind current 
texture before drawing. + // current_texture may have changed since the last flush. + self.gpu.set_texture(0, self.current_texture); + self.gpu.update_buffer(self.mtl_vbuf, xx self.vertices, upload_size); + self.gpu.draw_triangles(0, xx self.vertex_count); + } else { + // Only re-bind the current texture (program, projection, VAO + // already bound in begin()). glBufferData orphans the old buffer + // to avoid GPU sync stalls. + glBindTexture(GL_TEXTURE_2D, self.current_texture); + glBufferData(GL_ARRAY_BUFFER, xx upload_size, self.vertices, GL_DYNAMIC_DRAW); + glDrawArrays(GL_TRIANGLES, 0, xx self.vertex_count); + } self.vertex_count = 0; self.draw_calls += 1; @@ -458,3 +514,87 @@ void main() { } } GLSL; + +// --- Metal (MSL) — single library with vmain/fmain entry points --- +// +// `packed_float2 / packed_float4` keep the 12-float interleaved vertex +// layout (pos2 / uv2 / color4 / params4 = 48 bytes) without padding — +// MSL's default `float4` has 16-byte alignment and would force a 64-byte +// struct (see examples/63-metal-clear.sx for the gotcha). +// +// Uniform passing: GL uses `glUniformMatrix4fv("uProj", proj)`; Metal +// receives the projection via `setVertexBytes:length:atIndex:1` (slot 0 +// is the vertex buffer). Texture binding goes through +// `setFragmentTexture:atIndex:0`. 
+
+UI_MSL_SRC :: #string MSL
+#include <metal_stdlib>
+using namespace metal;
+
+struct UIVertex {
+    packed_float2 pos;
+    packed_float2 uv;
+    packed_float4 color;
+    packed_float4 params;
+};
+
+struct VOut {
+    float4 position [[position]];
+    float2 uv;
+    float4 color;
+    float4 params;
+};
+
+vertex VOut vmain(uint vid [[vertex_id]],
+                  constant UIVertex* verts [[buffer(0)]],
+                  constant float4x4& proj [[buffer(1)]]) {
+    UIVertex v = verts[vid];
+    VOut o;
+    o.position = proj * float4(v.pos, 0.0, 1.0);
+    o.uv = float2(v.uv);
+    o.color = float4(v.color);
+    o.params = float4(v.params);
+    return o;
+}
+
+static float roundedBoxSDF(float2 center, float2 half_size, float radius) {
+    float2 q = abs(center) - half_size + float2(radius);
+    return length(max(q, float2(0.0))) + min(max(q.x, q.y), 0.0) - radius;
+}
+
+fragment float4 fmain(VOut in [[stage_in]],
+                      texture2d<float> tex [[texture(0)]]) {
+    constexpr sampler s(coord::normalized, address::clamp_to_edge, filter::linear);
+
+    float mode = in.params.x;
+    float border = in.params.y;
+    float2 rectSize = in.params.zw;
+
+    if (mode < -1.5) {
+        // Image mode (mode == -2.0): sample texture
+        return tex.sample(s, in.uv) * in.color;
+    } else if (mode < 0.0) {
+        // Text mode (mode == -1.0): sample glyph atlas .r as alpha
+        float alpha = tex.sample(s, in.uv).r;
+        float ew = fwidth(alpha) * 0.7;
+        alpha = smoothstep(0.5 - ew, 0.5 + ew, alpha);
+        return float4(in.color.rgb, in.color.a * pow(alpha, 0.9));
+    } else if (mode > 0.0 || border > 0.0) {
+        // Rounded rect: SDF alpha, vertex color only
+        float2 half_size = rectSize * 0.5;
+        float2 center = (in.uv - float2(0.5)) * rectSize;
+        float dist = roundedBoxSDF(center, half_size, mode);
+        float aa = fwidth(dist);
+        float alpha = 1.0 - smoothstep(-aa, aa, dist);
+        if (border > 0.0) {
+            float inner = roundedBoxSDF(center, half_size - float2(border), max(mode - border, 0.0));
+            float border_alpha = smoothstep(-aa, aa, inner);
+            alpha = alpha * max(border_alpha, 0.0);
+        }
+        return float4(in.color.rgb, in.color.a * alpha);
+    } else {
+        // Plain rect: vertex color only
+        return in.color;
+    }
+}
+MSL;