diff --git a/examples/issue-0024.sx b/examples/issue-0024.sx
new file mode 100644
index 0000000..da7f3c1
--- /dev/null
+++ b/examples/issue-0024.sx
@@ -0,0 +1,90 @@
+// issue-0024: NSLog/foreign-side-effect calls placed as the FIRST statement
+// of an `if X { ... } else { ... }` branch body do not produce visible
+// output, even when the branch is provably taken (the SECOND statement in
+// the same body — also a foreign call — does produce output).
+//
+// ── Observed iOS-side symptom (session 59 bisect) ─────────────────────────
+//
+// In library/modules/gpu/metal.sx's `metal_create_texture_ios`:
+//
+//   slot : TextureSlot = .{ tex = tex, bytes_per_pixel = bytes_per_pixel };
+//   self.textures.append(slot);
+//   NSLog(ns_string("[metal] T6 appended\n".ptr));      // ← fires
+//
+//   pixels_null := pixels == null;
+//   if pixels_null {
+//       NSLog(ns_string("[metal] T6b null\n".ptr));     // ← never fires
+//   } else {
+//       NSLog(ns_string("[metal] T6a non-null\n".ptr)); // ← never fires
+//       handle : u32 = xx self.textures.len;
+//       metal_update_texture_region_ios(self, handle, 0, 0, w, h, pixels);
+//                                                       //  ← DOES fire
+//                                                       //    (its first
+//                                                       //    NSLog at
+//                                                       //    fn entry
+//                                                       //    appears in
+//                                                       //    the unified
+//                                                       //    log)
+//       NSLog(ns_string("[metal] T7 done\n".ptr));      // ← (helper crashed
+//                                                       //    before this)
+//   }
+//
+// T6 appears in the iOS unified log. T6a/T6b never appear. The else
+// branch's helper call DOES fire (its own first-statement NSLog inside
+// the helper appears). So the else-branch IS entered; just its first
+// NSLog statement produces no output.
+//
+// ── Pure-sx repro below does NOT trigger ───────────────────────────────────
+//
+// Running `sx run examples/issue-0024.sx` exits 0 (counter == 4 — all
+// bumps fired). The bug only manifests with foreign calls (NSLog / ns_string),
+// and possibly only when the process subsequently crashes (replaceRegion
+// in the metal.sx case) — which raises the alternative hypothesis that
+// the missing NSLog output is just iOS unified-logging buffer-loss on
+// process death, not a sx compiler bug. The runtime sequence between T6
+// and the crash was ~500μs; logs within ~1ms of an unhandled exception
+// can be lost to OSLog's internal buffering on Apple Silicon iOS-sim.
+//
+// ── Investigation plan ─────────────────────────────────────────────────────
+//
+// Two paths to disambiguate:
+//   1. Replace NSLog markers with `write(STDERR_FILENO, ...)` calls
+//      (synchronous, no OSLog involvement). If markers still don't appear:
+//      sx compiler bug — likely in src/ir/lower.zig:2166-2196 (the
+//      `is_value` branch of `lowerIfExpr` and downstream `lowerBlockValue`
+//      around 922-948). Possible: side-effecting leading statements
+//      dropped when branches are treated as values.
+//   2. If markers DO appear with synchronous write: the iOS-side symptom
+//      is unified-logging buffer-loss, not a compiler bug. Close this issue
+//      as "wontfix — diagnostic limitation" and move the iOS debugging to
+//      foreign-write tracing.
+//
+// ── Real-world impact ──────────────────────────────────────────────────────
+//
+// Bisecting issue-0026 (replaceRegion crash) is currently blocked: without
+// trustworthy markers inside if/else branches we can't tell which arg
+// arrives wrong. Resolution unblocks step 3b of the Metal port.
+
+#import "modules/std.sx";
+
+counter : s64 = 0;
+
+bump :: () { counter = counter + 1; }
+
+probe :: (skip: bool) {
+    bump();
+    if skip {
+        bump();
+        bump();
+    } else {
+        bump();
+        bump();
+    }
+    bump();
+}
+
+main :: () -> s32 {
+    probe(false);
+    // counter == 4 (entry + 2 in false branch + exit) → exit 0
+    if counter == 4 then 0 else 1;
+}
diff --git a/examples/issue-0025.sx b/examples/issue-0025.sx
new file mode 100644
index 0000000..6df5a4e
--- /dev/null
+++ b/examples/issue-0025.sx
@@ -0,0 +1,94 @@
+// issue-0025: Composite types larger than 16 bytes are passed without the
+// LLVM `byval(<ty>)` attribute, and the `call_indirect` (fn-pointer cast)
+// path doesn't apply C-ABI parameter coercion at all. Both gaps cause
+// silent shape-mismatch when sx code calls foreign C functions that take
+// large aggregates by value, OR when sx code calls a sx fn through a
+// fn-pointer typed with a large-aggregate parameter.
+//
+// ── Two failing forms ─────────────────────────────────────────────────────
+//
+// (A) Direct call to a sx function with a >16B param:
+//
+//     Wide :: struct { a: s64; b: s64; c: s64; d: s64; }   // 32 bytes
+//     accept :: (w: Wide) -> s64 { w.a + w.b + w.c + w.d; }
+//     accept(Wide.{ a = 1, b = 10, c = 100, d = 1000 })   // expect 1111
+//
+//   src/ir/emit_llvm.zig:2747-2795 (`abiCoerceParamType`):
+//     - <=8 bytes  → coerced to i64
+//     - 9-16 bytes → coerced to [2 x i64]
+//     - >16 bytes  → returns llvm_ty unchanged with TODO at line 2793
+//
+//   The TODO is the bug: large composites should be coerced to `ptr`
+//   with a `byval(struct.T)` LLVM attribute. LLVM's mid-end then
+//   materializes the right machine code per target. Today the struct
+//   is left as-is, which LLVM tries to pass across registers + stack
+//   slots in ways that don't match the C ABI promise.
+//
+// (B) Indirect call via fn-pointer cast (the `xx objc_msgSend` idiom):
+//
+//     fn_ptr : (Wide) -> s64 = xx accept;
+//     fn_ptr(Wide.{ ... })
+//
+//   src/ir/emit_llvm.zig:902-967 (`.call_indirect`): both the
+//   FunctionInfo-known arm (939-952) and the LLVMTypeOf-fallback arm
+//   (953-956) construct `param_tys[j]` WITHOUT routing through
+//   `abiCoerceParamType`. So even if (A) is fixed, fn-pointer-cast call
+//   sites still mis-marshal large composites.
+//
+// ── Real-world impact ──────────────────────────────────────────────────────
+//
+// Every `xx objc_msgSend` call site in library/modules/platform/uikit.sx
+// + library/modules/gpu/metal.sx. Works in practice today only because:
+//   - We never pass aggregates >16 bytes by value through fn-pointer casts
+//     (workaround: declare param as `*T` + pass `@local`; arm64 AAPCS's
+//     indirect-by-ref happens to match this machine-state-wise).
+//   - HFAs (CGSize 2×f64, MTLClearColor 4×f64, CGRect 4×f64 as return)
+//     are correctly classified at emit_llvm.zig:2766-2779.
+//
+// ── Workarounds in use ─────────────────────────────────────────────────────
+//
+// library/modules/gpu/metal.sx declares MTLRegion (48B) + MTLScissorRect
+// (32B) call sites with `*MTLRegion` / `*MTLScissorRect` and passes
+// `@region` / `@rect`. Should not be needed once this issue is fixed.
+//
+// ── Fix sketch ─────────────────────────────────────────────────────────────
+//
+// (A) emit_llvm.zig:2793 — return `ptr` and emit `byval(struct.T)` on
+//     the param via `LLVMAddCallSiteAttribute` / `LLVMCreateTypeAttribute`.
+//     At call sites, alloca + memcpy + pass the alloca pointer. Apply
+//     identically at function-definition emission so direct calls roundtrip.
+//
+// (B) emit_llvm.zig:902-967 — factor out a helper
+//     `coerceCallParams(param_count, src_args, dst_fn_param_tys)
+//        -> (coerced_args, coerced_tys)` that wraps `abiCoerceParamType`.
+//     Use the helper from both arms.
+//
+// Edge cases to preserve:
+//   - Variadic foreign functions (printf family) — variadic tail per
+//     AAPCS64 still passes composites in their natural form. Keep
+//     existing behavior for variadic args.
+//   - HFAs already handled at 2766-2779 — don't touch.
+//   - Structs <=8 bytes coerced to `i64`, 9-16 bytes to `[2 x i64]` —
+//     don't touch.
+
+#import "modules/std.sx";
+
+Wide :: struct {
+    a: s64; b: s64; c: s64; d: s64;
+}
+
+accept :: (w: Wide) -> s64 {
+    w.a + w.b + w.c + w.d;
+}
+
+main :: () -> s32 {
+    w := Wide.{ a = 1, b = 10, c = 100, d = 1000 };
+    direct := accept(w);             // exercises path (A)
+    if direct != 1111 { return 1; }
+
+    fn_ptr : (Wide) -> s64 = xx accept;
+    indirect := fn_ptr(w);           // exercises path (B)
+    if indirect != 1111 { return 2; }
+
+    0;
+}
diff --git a/examples/issue-0026.sx b/examples/issue-0026.sx
new file mode 100644
index 0000000..8554fb2
--- /dev/null
+++ b/examples/issue-0026.sx
@@ -0,0 +1,68 @@
+// issue-0026: Chess game on iOS-sim with `plat.gpu_mode = .metal` crashes
+// inside `[MTLTexture replaceRegion:mipmapLevel:withBytes:bytesPerRow:]`
+// when uploading the 1024×1024 R8 font atlas. The 1×1 RGBA8 white tex
+// through the SAME code path (metal_update_texture_region_ios in
+// library/modules/gpu/metal.sx) works.
+//
+// Blocked on issue-0024 (NSLog inside if/else not firing — or unified-log
+// buffer loss on crash; investigation pending) — without a trustworthy
+// tracer we can't reliably bisect which arg arrives wrong. Most likely
+// cause: this is downstream of issue-0025's ABI gaps (MTLRegion is 48
+// bytes and goes through `xx objc_msgSend` cast, which is the
+// call_indirect path that issue-0025 part B covers).
+//
+// ── Reproduction recipe ───────────────────────────────────────────────────
+//
+//   cd /Users/agra/projects/game
+//   /Users/agra/projects/sx/zig-out/bin/sx build --target ios-sim main.sx \
+//     --bundle sx-out/ios/SxChess.app --bundle-id co.swipelab.sxchess \
+//     -F ~/Library/Frameworks
+//   cp -R assets sx-out/ios/SxChess.app/
+//   codesign --force --sign - --timestamp=none sx-out/ios/SxChess.app
+//   xcrun simctl install booted sx-out/ios/SxChess.app
+//   xcrun simctl launch --terminate-running-process booted co.swipelab.sxchess
+//   sleep 4 && xcrun simctl io booted screenshot /tmp/sx-chess.png
+//
+// Expected (after fix): chess board renders via Metal.
+// Observed: app launches, returns immediately to home screen, no screen
+// touched. The simpler examples/63-metal-clear.sx demo still renders the
+// colored triangle on the same sim, so the Metal pipeline itself works
+// for small uploads.
+//
+// ── Candidate root causes (in priority order) ─────────────────────────────
+//
+// 1. issue-0025 fallout (most likely): MTLRegion (48 B by value) passed
+//    via the *MTLRegion workaround. The call_indirect path (issue-0025
+//    part B) doesn't ABI-coerce, so the pointer-shaped declaration may
+//    not actually pass the address in the right register slot for that
+//    call site shape (6 args, including the indirect aggregate).
+//
+// 2. iOS-sim Metal-driver limitation: `setStorageMode:.shared` may not be
+//    honored for r8 textures of this size; default may be `.private`
+//    which precludes CPU-side replaceRegion. Workaround would be to
+//    upload via `MTLBuffer` + `MTLBlitCommandEncoder` (newBufferWithBytes
+//    + copyFromBuffer:sourceOffset:sourceBytesPerRow:...:toTexture:...).
+//
+// 3. sx-side `xx` cast bug: bytes_per_row : u64 = xx (u32_expr) may
+//    truncate or sign-extend incorrectly. Less likely (the math comes
+//    out to 1024, which fits in any width).
+//
+// ── How to resolve ────────────────────────────────────────────────────────
+//
+// After issues 0024 + 0025 are landed:
+//   1. Re-add the trace NSLog markers ("[metal] U1..U5" in
+//      metal_update_texture_region_ios) — now they should actually print.
+//   2. Re-build + relaunch chess on iOS-sim.
+//   3. If U5 fires after U4 (no crash inside msg_replace), the bug was
+//      ABI-related; declare success and rename this file to
+//      examples/NN-metal-large-region-upload.sx (next free NN).
+//   4. If U4 → crash persists, fall back to the MTLBuffer + blit
+//      encoder path in metal.sx's create_texture (when pixels != null,
+//      allocate a temporary MTLBuffer with newBufferWithBytes:length:options:
+//      then run a one-shot command buffer with a MTLBlitCommandEncoder
+//      copying the buffer into the texture). This is the Apple-recommended
+//      approach for large texture initial-uploads.
+
+#import "modules/std.sx";
+
+main :: () -> s32 { 0; }
diff --git a/examples/issue-0027.sx b/examples/issue-0027.sx
new file mode 100644
index 0000000..8e689bf
--- /dev/null
+++ b/examples/issue-0027.sx
@@ -0,0 +1,50 @@
+// issue-0027: Feature — support Obj-C blocks (^{...}) so sx code can call
+// APIs that take a block parameter. Required for step 4 of the Metal port
+// (keyboard lockstep via `[UIView animateWithDuration:animations:^{...}]`),
+// and broadly useful for any UIKit/AppKit API.
+//
+// ── Proposed surface ──────────────────────────────────────────────────────
+//
+// Option A — comptime intrinsic that wraps a sx closure as a block:
+//
+//   block := objc_block(@my_closure);    // returns *void (an id<Block>)
+//   msg_block(view, sel, 0.3, block);    // pass like any id arg
+//
+//   Internals: emit a Block_literal struct constant with the right invoke
+//   fn pointer, isa, flags, descriptor pointer. Approximately what clang
+//   generates for ^{...}.
+//
+// Option B — surface-level syntax `^{ ... }` that lowers to Option A
+//   automatically. Cleaner for users; more parser work.
+//
+// Recommended: start with Option A (intrinsic). Migrate to Option B once
+// the codegen path is proven.
+//
+// ── Implementation sketch ────────────────────────────────────────────────
+//
+// 1. New `library/modules/std/objc_block.sx` defining the Block_literal
+//    struct that mirrors clang's layout (isa, flags, reserved, invoke fn
+//    pointer, descriptor pointer).
+// 2. `objc_block(fn_or_closure) -> *void` intrinsic that builds the
+//    literal at the call site. Initial implementation can be a
+//    stack-allocated block (_NSConcreteStackBlock); upgrade to
+//    heap-promoted (_Block_copy) once block lifetime exceeds the call.
+// 3. Link libSystem's symbols `_NSConcreteStackBlock` and
+//    `_NSConcreteGlobalBlock` (auto on iOS; may need `#library "System"`
+//    on macOS).
+// 4. (Deferred) surface syntax `^{ ... }` — parser hook + lowering
+//    to the intrinsic. Must not clash with bitwise XOR `^`.
+//
+// ── References ────────────────────────────────────────────────────────────
+//
+// - Apple block ABI spec (clang's "Block Implementation Specification")
+// - _NSConcreteStackBlock + _NSConcreteGlobalBlock from libSystem
+//
+// ── Real-world impact ─────────────────────────────────────────────────────
+//
+// Without this, the keyboard inset cannot be animated in lockstep with the
+// keyboard slide. See library/modules/platform/uikit.sx's
+// uikit_keyboard_will_change_frame comments for the deferred lockstep work.
+
+#import "modules/std.sx";
+main :: () -> s32 { 0; }
diff --git a/examples/issue-0028.sx b/examples/issue-0028.sx
new file mode 100644
index 0000000..1fe8bc4
--- /dev/null
+++ b/examples/issue-0028.sx
@@ -0,0 +1,53 @@
+// issue-0028: Feature — make protocol boxes assignable to an optional
+// type so callers can spell "no GPU bound" as `?GPU = null` instead of
+// the verbose `T = ---; has_T: bool` pattern.
+//
+// ── Current pattern (verbose) ─────────────────────────────────────────────
+//
+//   gpu: GPU = ---;
+//   has_gpu: bool = false;
+//   ...
+//   if self.has_gpu { self.gpu.create_shader(...); }
+//
+// ── Proposed pattern ──────────────────────────────────────────────────────
+//
+//   gpu: ?GPU = null;
+//   ...
+//   if self.gpu != null { self.gpu.create_shader(...); }
+//
+// ── Where the verbose pattern lives today ─────────────────────────────────
+//
+// library/modules/ui/renderer.sx     — UIRenderer.gpu + has_gpu
+// library/modules/ui/glyph_cache.sx  — GlyphCache.gpu + has_gpu
+// library/modules/ui/pipeline.sx     — UIPipeline.gpu + has_gpu (+ set_gpu)
+// library/modules/platform/uikit.sx  — UIKitPlatform.frame_closure +
+//                                       has_frame_closure (Closure type,
+//                                       same pattern but on a closure)
+//
+// ── Implementation sketch ─────────────────────────────────────────────────
+//
+// Protocol boxes are 2-pointer structs ({vtable, ctx} or {ctx, fn_ptrs...}
+// depending on the inline-vs-vtable shape — see src/ir/lower.zig
+// `buildProtocolValue` ~7800-7869). `?T` for these can use `vtable_ptr ==
+// null` (or `ctx == null`, depending on layout choice) as the "none"
+// sentinel — no extra storage needed. This matches the existing
+// optional-closure handling at src/ir/emit_llvm.zig where `?Closure` uses
+// `fn_ptr == null` as none.
+//
+// Approach:
+//   1. Extend `?T` type construction to accept T being a protocol type.
+//      Files: src/ir/types.zig + src/ir/lower.zig (type-resolution).
+//   2. Implement `optional_wrap` / `optional_unwrap` /
+//      `optional_has_value` for protocol-typed payloads in
+//      src/ir/emit_llvm.zig — model after the closure-optional path.
+//   3. Keep the existing `T = ---; has_T: bool` pattern working — the
+//      new `?T` is additive, not a replacement. Don't churn existing
+//      files (uikit.sx's frame_closure pattern stays).
+//
+// ── Syntax constraint ─────────────────────────────────────────────────────
+//
+// `?T` syntax already exists for primitives + pointers. Extending to
+// protocols is a type-system change; no new surface syntax needed.
+
+#import "modules/std.sx";
+main :: () -> s32 { 0; }
diff --git a/examples/issue-0029.sx b/examples/issue-0029.sx
new file mode 100644
index 0000000..c7a5d48
--- /dev/null
+++ b/examples/issue-0029.sx
@@ -0,0 +1,47 @@
+// issue-0029: Feature — add explicit destructors to the GPU protocol so
+// resources can be freed without leaking.
+//
+// ── Proposed additions to library/modules/gpu/api.sx ──────────────────────
+//
+//   destroy_shader  :: (h: ShaderHandle);
+//   destroy_buffer  :: (h: BufferHandle);
+//   destroy_texture :: (h: TextureHandle);
+//
+// ── Why ────────────────────────────────────────────────────────────────────
+//
+// Today, library/modules/ui/glyph_cache.sx's `grow()` method recreates
+// the atlas texture at a larger size but has no way to release the old
+// one — see the comment in metal.sx that explicitly notes the leak. The
+// GL path uses glDeleteTextures(1, @self.texture_id); the GPU protocol
+// has no equivalent yet.
+//
+// ── Implementation notes ──────────────────────────────────────────────────
+//
+// Metal backend: send `release` to the MTLTexture / MTLBuffer /
+// MTLRenderPipelineState (or call CFRelease, since these are
+// CFTypeRef-compatible). Clear the corresponding slot in
+// MetalGPU.textures / buffers / shaders to `null` / 0.
+//
+// GL backend (future): glDeleteTextures / glDeleteBuffers / glDeleteProgram.
+//
+// Handle lifecycle: after destroy, the slot in the backend List is freed.
+// New allocations can take that slot or grow the list. Caller's handles
+// remain valid until destroy. Don't aggressively re-use slots in MVP;
+// keep handles append-only with a `null` marker for destroyed entries
+// (matches the current shape).
+//
+// ── Touch points ──────────────────────────────────────────────────────────
+//
+// library/modules/gpu/api.sx           — add 3 protocol method signatures
+// library/modules/gpu/metal.sx         — implement them (release + null
+//                                         the slot)
+// library/modules/ui/glyph_cache.sx    — call destroy_texture(old_handle)
+//                                         in grow() before creating the
+//                                         new atlas
+//
+// ── Syntax constraint ─────────────────────────────────────────────────────
+//
+// None — straight protocol-method addition.
+
+#import "modules/std.sx";
+main :: () -> s32 { 0; }
diff --git a/examples/issue-0030.sx b/examples/issue-0030.sx
new file mode 100644
index 0000000..08ee85b
--- /dev/null
+++ b/examples/issue-0030.sx
@@ -0,0 +1,57 @@
+// issue-0030: Feature — support `extern` global declarations so a global
+// declared in one sx source file can be referenced from another without
+// parameter threading.
+//
+// ── Use case from the Metal port ──────────────────────────────────────────
+//
+//   // game/main.sx
+//   g_metal_gpu : *MetalGPU = null;
+//
+//   // game/chess/pieces.sx
+//   extern g_metal_gpu : *MetalGPU;
+//
+//   load :: (self: *ChessPieces, path: [:0]u8) {
+//       ...
+//       inline if OS == .ios {
+//           tex := g_metal_gpu.create_texture(w, h, .rgba8, xx pixels);
+//       } else {
+//           // GL path
+//       }
+//   }
+//
+// Today, pieces.load takes `has_gpu: bool, gpu: GPU` parameters and
+// game/main.sx threads them through. Cross-file `extern` globals would
+// let us drop those parameters.
+//
+// ── Implementation sketch ─────────────────────────────────────────────────
+//
+// Mirror how foreign function declarations work — declared in one file,
+// defined elsewhere, linker resolves. Globals already have first-class
+// addresses in the IR; just add an "extern" flag that says "don't emit
+// storage, emit a reference."
+//
+// Files:
+//   - parser (sx surface syntax for `extern G : T;`)
+//   - src/ir/lower.zig (record an extern global stub that resolves at
+//     module-link time)
+//   - src/ir/emit_llvm.zig (emit an `external` LLVM global)
+//
+// ── Syntax constraint ─────────────────────────────────────────────────────
+//
+// `extern G : T;` is a NEW top-level form. Must not clash with:
+//   - `G :: T;`      (type alias)
+//   - `G : T = ---;` (uninitialized global with explicit type)
+//   - `G : T;`       (does this currently parse as anything?)
+//
+// The parser MUST reject `extern G : T = expr;` — extern cannot have an
+// initializer (the definition lives elsewhere).
+//
+// ── Caveat ────────────────────────────────────────────────────────────────
+//
+// Encourages spaghetti globals. Documentation should steer callers toward
+// explicit parameter passing where reasonable. Useful for genuine
+// process-singletons (the active GPU, the active platform, etc.) where
+// threading them through every call site is more noise than signal.
+
+#import "modules/std.sx";
+main :: () -> s32 { 0; }
diff --git a/library/modules/gpu/metal.sx b/library/modules/gpu/metal.sx
index 99aa4cf..68e8cec 100644
--- a/library/modules/gpu/metal.sx
+++ b/library/modules/gpu/metal.sx
@@ -28,6 +28,12 @@ MTL_PIXEL_FORMAT_R8_UNORM    :u64: 10;
 MTL_LOAD_ACTION_CLEAR  :u64: 2;
 MTL_STORE_ACTION_STORE :u64: 1;
 
+// MTLStorageMode. For UI atlases + sprites the CPU needs to write pixels
+// and the GPU needs to sample — `.shared` is the safe default. On iOS-sim
+// under Apple Silicon the convenience class method's default storage
+// isn't reliably shared, so we set it explicitly in metal_create_texture_ios.
+MTL_STORAGE_MODE_SHARED :u64: 0;
+
 // MTLPrimitiveType.
 MTL_PRIMITIVE_TYPE_TRIANGLE :u64: 3;
 
@@ -84,11 +90,18 @@ MetalGPU :: struct {
 }
 
 impl GPU for MetalGPU {
+    // Two-phase init: callers can `init(null, 0, 0)` first to allocate
+    // device + queue eagerly (lets the UI pipeline compile shaders before
+    // UIKit hands us a layer), then re-call `init(layer, w, h)` once the
+    // CAMetalLayer is available. The second call only updates the layer
+    // ref + dims; device/queue are preserved.
     init :: (self: *MetalGPU, target: *void, pixel_w: s32, pixel_h: s32) -> bool {
         inline if OS != .ios { return false; }
-        self.layer   = target;
-        self.pixel_w = pixel_w;
-        self.pixel_h = pixel_h;
+        if target != null {
+            self.layer   = target;
+            self.pixel_w = pixel_w;
+            self.pixel_h = pixel_h;
+        }
         metal_init_ios(self);
     }
 
@@ -200,12 +213,19 @@ impl GPU for MetalGPU {
 // so non-iOS builds never reference the unresolved Metal symbols below.
 // ───────────────────────────────────────────────────────────────────────────
 
+// init() may be called twice: once with target==null to create device +
+// queue eagerly (so the UI pipeline can compile shaders before UIKit
+// has a layer for us), then again with target=CAMetalLayer once
+// `-[SxAppDelegate didFinishLaunching:]` has installed the view.
+// Both calls go through this helper; it's idempotent on the device/queue
+// and only touches the layer when one's been supplied.
 metal_init_ios :: (self: *MetalGPU) -> bool {
     inline if OS != .ios { return false; }
-    if self.layer == null { return false; }
 
-    self.device = MTLCreateSystemDefaultDevice();
-    if self.device == null { return false; }
+    if self.device == null {
+        self.device = MTLCreateSystemDefaultDevice();
+        if self.device == null { return false; }
+    }
 
     msg_oo    : (*void, *void, *void)  -> void  = xx objc_msgSend;
     msg_ou    : (*void, *void, u64)    -> void  = xx objc_msgSend;
@@ -213,15 +233,19 @@ metal_init_ios :: (self: *MetalGPU) -> bool {
     msg_osize : (*void, *void, CGSize) -> void  = xx objc_msgSend;
     msg_o     : (*void, *void)         -> *void = xx objc_msgSend;
 
-    msg_oo(self.layer, sel_registerName("setDevice:".ptr), self.device);
-    msg_ou(self.layer, sel_registerName("setPixelFormat:".ptr), MTL_PIXEL_FORMAT_BGRA8_UNORM);
-    msg_ob(self.layer, sel_registerName("setFramebufferOnly:".ptr), 1);
+    if self.queue == null {
+        self.queue = msg_o(self.device, sel_registerName("newCommandQueue".ptr));
+        if self.queue == null { return false; }
+    }
 
-    size := CGSize.{ width = xx self.pixel_w, height = xx self.pixel_h };
-    msg_osize(self.layer, sel_registerName("setDrawableSize:".ptr), size);
+    if self.layer != null {
+        msg_oo(self.layer, sel_registerName("setDevice:".ptr), self.device);
+        msg_ou(self.layer, sel_registerName("setPixelFormat:".ptr), MTL_PIXEL_FORMAT_BGRA8_UNORM);
+        msg_ob(self.layer, sel_registerName("setFramebufferOnly:".ptr), 1);
 
-    self.queue = msg_o(self.device, sel_registerName("newCommandQueue".ptr));
-    if self.queue == null { return false; }
+        size := CGSize.{ width = xx self.pixel_w, height = xx self.pixel_h };
+        msg_osize(self.layer, sel_registerName("setDrawableSize:".ptr), size);
+    }
 
     true;
 }
@@ -457,6 +481,12 @@ metal_create_texture_ios :: (self: *MetalGPU, w: s32, h: s32, format: TextureFor
         pixel_format, xx w, xx h, 0);
     if desc == null { return 0; }
 
+    // Force shared storage so the CPU can keep writing pixels (atlas updates,
+    // sprite uploads). On iOS-sim under Apple Silicon the convenience class
+    // method's default storage isn't reliably shared for every format.
+    msg_ou_void : (*void, *void, u64) -> void = xx objc_msgSend;
+    msg_ou_void(desc, sel_registerName("setStorageMode:".ptr), MTL_STORAGE_MODE_SHARED);
+
     msg_oo : (*void, *void, *void) -> *void = xx objc_msgSend;
     tex := msg_oo(self.device, sel_registerName("newTextureWithDescriptor:".ptr), desc);
     if tex == null { return 0; }
diff --git a/library/modules/ui/glyph_cache.sx b/library/modules/ui/glyph_cache.sx
index 1e16e0b..44012f1 100755
--- a/library/modules/ui/glyph_cache.sx
+++ b/library/modules/ui/glyph_cache.sx
@@ -1,5 +1,7 @@
 #import "modules/std.sx";
 #import "modules/opengl.sx";
+#import "modules/gpu/types.sx";
+#import "modules/gpu/api.sx";
 #import "modules/stb_truetype.sx";
 #import "modules/ui/types.sx";
 
@@ -176,9 +178,20 @@ GlyphCache :: struct {
     last_shape_len: s64;
     last_shape_size_q: u16;
 
+    // GPU protocol backend. When `has_gpu`, atlas creation + dirty uploads
+    // route through `gpu` instead of raw GL.
+    gpu: GPU = ---;
+    has_gpu: bool = false;
+
     init :: (self: *GlyphCache, path: [:0]u8, default_size: f32) {
+        // Preserve any pre-set GPU dispatch across the zero-out — the
+        // surrounding struct memset would otherwise wipe it.
+        saved_gpu := self.gpu;
+        saved_has_gpu := self.has_gpu;
         // Zero out the entire struct first (parent may be uninitialized with = ---)
         memset(self, 0, size_of(GlyphCache));
+        self.gpu = saved_gpu;
+        self.has_gpu = saved_has_gpu;
 
         // Load font file
         file_size : s32 = 0;
@@ -245,15 +258,25 @@ GlyphCache :: struct {
         val_bytes : s64 = self.hash_cap * 8;  // s64 per slot (s32 would suffice but alignment)
         self.hash_vals = xx context.allocator.alloc(val_bytes);
 
-        // Create OpenGL texture
-        glGenTextures(1, @self.texture_id);
-        glBindTexture(GL_TEXTURE_2D, self.texture_id);
-        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-        glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, self.atlas_width, self.atlas_height, 0, GL_RED, GL_UNSIGNED_BYTE, self.bitmap);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE);
+        // Create the atlas texture. In GPU-protocol mode we create empty and
+        // let the first `flush()` push the (zero-initialized) bitmap via
+        // update_texture_region — same result as the GL path's glTexImage2D
+        // with the zeroed bitmap, but works whether or not the backend
+        // accepts CPU pixel pointers at create time.
+        if self.has_gpu {
+            self.texture_id = self.gpu.create_texture(
+                self.atlas_width, self.atlas_height, .r8, null);
+            self.dirty = true;
+        } else {
+            glGenTextures(1, @self.texture_id);
+            glBindTexture(GL_TEXTURE_2D, self.texture_id);
+            glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+            glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, self.atlas_width, self.atlas_height, 0, GL_RED, GL_UNSIGNED_BYTE, self.bitmap);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE);
+        }
 
         out("GlyphCache initialized: ");
         out(path);
@@ -406,9 +429,14 @@ GlyphCache :: struct {
     // Upload dirty atlas to GPU
     flush :: (self: *GlyphCache) {
         if self.dirty == false { return; }
-        glBindTexture(GL_TEXTURE_2D, self.texture_id);
-        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.atlas_width, self.atlas_height, GL_RED, GL_UNSIGNED_BYTE, self.bitmap);
+        if self.has_gpu {
+            self.gpu.update_texture_region(self.texture_id, 0, 0,
+                self.atlas_width, self.atlas_height, xx self.bitmap);
+        } else {
+            glBindTexture(GL_TEXTURE_2D, self.texture_id);
+            glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+            glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.atlas_width, self.atlas_height, GL_RED, GL_UNSIGNED_BYTE, self.bitmap);
+        }
         self.dirty = false;
     }
 
@@ -464,16 +492,23 @@ GlyphCache :: struct {
         self.atlas_width = new_w;
         self.atlas_height = new_h;
 
-        // Recreate GL texture
-        glDeleteTextures(1, @self.texture_id);
-        glGenTextures(1, @self.texture_id);
-        glBindTexture(GL_TEXTURE_2D, self.texture_id);
-        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-        glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, new_w, new_h, 0, GL_RED, GL_UNSIGNED_BYTE, new_bitmap);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE);
+        // Recreate atlas at the new size.
+        if self.has_gpu {
+            // No destroy_texture in the GPU protocol yet — old atlas
+            // leaks in the backend table until process exit. Atlas grow
+            // is rare so this is acceptable for now.
+            self.texture_id = self.gpu.create_texture(new_w, new_h, .r8, xx new_bitmap);
+        } else {
+            glDeleteTextures(1, @self.texture_id);
+            glGenTextures(1, @self.texture_id);
+            glBindTexture(GL_TEXTURE_2D, self.texture_id);
+            glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+            glTexImage2D(GL_TEXTURE_2D, 0, xx GL_R8, new_w, new_h, 0, GL_RED, GL_UNSIGNED_BYTE, new_bitmap);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, xx GL_LINEAR);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, xx GL_LINEAR);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, xx GL_CLAMP_TO_EDGE);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, xx GL_CLAMP_TO_EDGE);
+        }
 
         // Recompute UV coordinates for all cached glyphs
         atlas_wf : f32 = xx new_w;
diff --git a/library/modules/ui/pipeline.sx b/library/modules/ui/pipeline.sx
index 9a10e46..8271fe9 100755
--- a/library/modules/ui/pipeline.sx
+++ b/library/modules/ui/pipeline.sx
@@ -1,6 +1,7 @@
 #import "modules/std.sx";
 #import "modules/allocators.sx";
 #import "modules/opengl.sx";
+#import "modules/gpu/api.sx";
 #import "modules/ui/types.sx";
 #import "modules/ui/render.sx";
 #import "modules/ui/events.sx";
@@ -24,6 +25,23 @@ UIPipeline :: struct {
     has_body: bool;
     parent_allocator: Allocator;
 
+    // GPU protocol backend. When `has_gpu`, the pipeline propagates this
+    // to its renderer + font, and skips the per-frame GL state setup in
+    // commit_gpu (Metal bakes blend mode into the pipeline state).
+    gpu: GPU = ---;
+    has_gpu: bool = false;
+
+    // Set the GPU dispatch BEFORE calling init() / init_font() so the
+    // shaders + atlas land on the right backend.
+    set_gpu :: (self: *UIPipeline, gpu: GPU) {
+        self.gpu = gpu;
+        self.has_gpu = true;
+        self.renderer.gpu = gpu;
+        self.renderer.has_gpu = true;
+        self.font.gpu = gpu;
+        self.font.has_gpu = true;
+    }
+
     init :: (self: *UIPipeline, width: f32, height: f32) {
         self.render_tree = RenderTree.init();
         self.renderer.init();
@@ -149,14 +167,18 @@ UIPipeline :: struct {
     }
 
     commit_gpu :: (self: *UIPipeline) {
-        glEnable(GL_BLEND);
-        glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
-        glDisable(GL_DEPTH_TEST);
+        if !self.has_gpu {
+            glEnable(GL_BLEND);
+            glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+            glDisable(GL_DEPTH_TEST);
+        }
 
         self.renderer.begin(self.screen_width, self.screen_height, self.font.texture_id);
         self.renderer.process(@self.render_tree);
         self.renderer.flush();
 
-        glDisable(GL_BLEND);
+        if !self.has_gpu {
+            glDisable(GL_BLEND);
+        }
     }
 }
diff --git a/library/modules/ui/renderer.sx b/library/modules/ui/renderer.sx
index ab92429..9c7646d 100755
--- a/library/modules/ui/renderer.sx
+++ b/library/modules/ui/renderer.sx
@@ -2,6 +2,8 @@
 #import "modules/compiler.sx";
 #import "modules/opengl.sx";
 #import "modules/math";
+#import "modules/gpu/types.sx";
+#import "modules/gpu/api.sx";
 #import "modules/ui/types.sx";
 #import "modules/ui/render.sx";
 #import "modules/ui/glyph_cache.sx";
@@ -13,62 +15,81 @@ UI_VERTEX_BYTES  :s64: 48;
 MAX_UI_VERTICES  :s64: 16384;
 
 UIRenderer :: struct {
+    // GL-side handles. Used when `gpu == null` (every non-iOS target today).
     vao: u32;
     vbo: u32;
     shader: u32;
     proj_loc: s32;
     tex_loc: s32;
+
+    // CPU-side vertex scratch buffer — same for both backends.
     vertices: [*]f32;
     vertex_count: s64;
     screen_width: f32;
     screen_height: f32;
     dpi_scale: f32;
-    white_texture: u32;
+    white_texture: u32;     // GL name OR TextureHandle (both are u32-shaped)
     current_texture: u32;
     draw_calls: s64;
 
-    init :: (self: *UIRenderer) {
-        // Create shader (ES for WASM/WebGL2 + iOS GLES3, Core for desktop GL 3.3)
-        inline if OS == .wasm or OS == .ios {
-            self.shader = create_program(UI_VERT_SRC_ES, UI_FRAG_SRC_ES);
-        } else {
-            self.shader = create_program(UI_VERT_SRC_CORE, UI_FRAG_SRC_CORE);
-        }
-        self.proj_loc = glGetUniformLocation(self.shader, "uProj");
-        self.tex_loc = glGetUniformLocation(self.shader, "uTex");
+    // GPU protocol backend. When `has_gpu`, the renderer routes shader /
+    // buffer / texture / draw calls through `gpu` instead of raw GL. The
+    // chess game sets this on iOS to a boxed `*MetalGPU`.
+    gpu: GPU = ---;
+    has_gpu: bool = false;
+    mtl_shader: ShaderHandle = 0;
+    mtl_vbuf:   BufferHandle = 0;
 
-        // Allocate vertex buffer (CPU side)
+    init :: (self: *UIRenderer) {
+        // Allocate vertex scratch (CPU side) — same for both backends.
         buf_size := MAX_UI_VERTICES * UI_VERTEX_BYTES;
         self.vertices = xx context.allocator.alloc(buf_size);
         memset(self.vertices, 0, buf_size);
         self.vertex_count = 0;
-
-        // Create VAO/VBO
-        glGenVertexArrays(1, @self.vao);
-        glGenBuffers(1, @self.vbo);
-        glBindVertexArray(self.vao);
-        glBindBuffer(GL_ARRAY_BUFFER, self.vbo);
-        glBufferData(GL_ARRAY_BUFFER, xx buf_size, null, GL_DYNAMIC_DRAW);
-
-        // pos (2 floats)
-        glVertexAttribPointer(0, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 0);
-        glEnableVertexAttribArray(0);
-        // uv (2 floats)
-        glVertexAttribPointer(1, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 8);
-        glEnableVertexAttribArray(1);
-        // color (4 floats)
-        glVertexAttribPointer(2, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 16);
-        glEnableVertexAttribArray(2);
-        // params: corner_radius, border_width, rect_w, rect_h
-        glVertexAttribPointer(3, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 32);
-        glEnableVertexAttribArray(3);
-
-        glBindVertexArray(0);
-
         self.dpi_scale = 1.0;
 
-        // 1x1 white texture for solid rects
-        self.white_texture = create_white_texture();
+        if self.has_gpu {
+            // ── Metal backend (via GPU protocol) ───────────────────────
+            self.mtl_shader = self.gpu.create_shader(UI_MSL_SRC, "");
+            self.mtl_vbuf   = self.gpu.create_buffer(buf_size);
+            white_px : [4]u8 = .[255, 255, 255, 255];
+            self.white_texture = self.gpu.create_texture(1, 1, .rgba8, xx @white_px[0]);
+        } else {
+            // ── GL backend ─────────────────────────────────────────────
+            // Create shader (ES for WASM/WebGL2 + iOS GLES3, Core for desktop GL 3.3)
+            inline if OS == .wasm or OS == .ios {
+                self.shader = create_program(UI_VERT_SRC_ES, UI_FRAG_SRC_ES);
+            } else {
+                self.shader = create_program(UI_VERT_SRC_CORE, UI_FRAG_SRC_CORE);
+            }
+            self.proj_loc = glGetUniformLocation(self.shader, "uProj");
+            self.tex_loc = glGetUniformLocation(self.shader, "uTex");
+
+            // Create VAO/VBO
+            glGenVertexArrays(1, @self.vao);
+            glGenBuffers(1, @self.vbo);
+            glBindVertexArray(self.vao);
+            glBindBuffer(GL_ARRAY_BUFFER, self.vbo);
+            glBufferData(GL_ARRAY_BUFFER, xx buf_size, null, GL_DYNAMIC_DRAW);
+
+            // pos (2 floats)
+            glVertexAttribPointer(0, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 0);
+            glEnableVertexAttribArray(0);
+            // uv (2 floats)
+            glVertexAttribPointer(1, 2, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 8);
+            glEnableVertexAttribArray(1);
+            // color (4 floats)
+            glVertexAttribPointer(2, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 16);
+            glEnableVertexAttribArray(2);
+            // params: corner_radius, border_width, rect_w, rect_h
+            glVertexAttribPointer(3, 4, GL_FLOAT, 0, xx UI_VERTEX_BYTES, xx 32);
+            glEnableVertexAttribArray(3);
+
+            glBindVertexArray(0);
+
+            // 1x1 white texture for solid rects
+            self.white_texture = create_white_texture();
+        }
     }
 
     begin :: (self: *UIRenderer, width: f32, height: f32, font_texture: u32) {
@@ -78,15 +99,26 @@ UIRenderer :: struct {
         self.current_texture = font_texture;
         self.draw_calls = 0;
 
-        // Set up GL state once for the entire frame
-        glUseProgram(self.shader);
         proj := Mat4.ortho(0.0, width, height, 0.0, -1.0, 1.0);
-        glUniformMatrix4fv(self.proj_loc, 1, 0, proj.data);
-        glUniform1i(self.tex_loc, 0);
-        glActiveTexture(GL_TEXTURE0);
-        glBindTexture(GL_TEXTURE_2D, font_texture);
-        glBindVertexArray(self.vao);
-        glBindBuffer(GL_ARRAY_BUFFER, self.vbo);
+
+        if self.has_gpu {
+            // Pipeline state + vertex buffer + projection + initial texture.
+            // Metal blend mode + scissor-cleared defaults are baked into
+            // the pipeline state, so no per-frame glEnable/glDisable.
+            self.gpu.set_shader(self.mtl_shader);
+            self.gpu.set_vertex_buffer(self.mtl_vbuf);
+            self.gpu.set_vertex_constants(1, xx proj.data, 64);
+            self.gpu.set_texture(0, font_texture);
+        } else {
+            // GL: bind everything for the frame.
+            glUseProgram(self.shader);
+            glUniformMatrix4fv(self.proj_loc, 1, 0, proj.data);
+            glUniform1i(self.tex_loc, 0);
+            glActiveTexture(GL_TEXTURE0);
+            glBindTexture(GL_TEXTURE_2D, font_texture);
+            glBindVertexArray(self.vao);
+            glBindBuffer(GL_ARRAY_BUFFER, self.vbo);
+        }
     }
 
     bind_texture :: (self: *UIRenderer, tex: u32) {
@@ -202,18 +234,33 @@ UIRenderer :: struct {
                 }
                 case .clip_push: {
                     self.flush();
-                    glEnable(GL_SCISSOR_TEST);
                     dpi := self.dpi_scale;
-                    glScissor(
-                        xx (node.frame.origin.x * dpi),
-                        xx ((self.screen_height - node.frame.origin.y - node.frame.size.height) * dpi),
-                        xx (node.frame.size.width * dpi),
-                        xx (node.frame.size.height * dpi)
-                    );
+                    if self.has_gpu {
+                        // Metal: pixel coords, top-left origin (no Y flip).
+                        self.gpu.set_scissor(
+                            xx (node.frame.origin.x * dpi),
+                            xx (node.frame.origin.y * dpi),
+                            xx (node.frame.size.width * dpi),
+                            xx (node.frame.size.height * dpi),
+                        );
+                    } else {
+                        // GL: pixel coords, bottom-left origin — flip Y.
+                        glEnable(GL_SCISSOR_TEST);
+                        glScissor(
+                            xx (node.frame.origin.x * dpi),
+                            xx ((self.screen_height - node.frame.origin.y - node.frame.size.height) * dpi),
+                            xx (node.frame.size.width * dpi),
+                            xx (node.frame.size.height * dpi)
+                        );
+                    }
                 }
                 case .clip_pop: {
                     self.flush();
-                    glDisable(GL_SCISSOR_TEST);
+                    if self.has_gpu {
+                        self.gpu.disable_scissor();
+                    } else {
+                        glDisable(GL_SCISSOR_TEST);
+                    }
                 }
                 case .opacity_push: {}
                 case .opacity_pop: {}
@@ -225,13 +272,22 @@ UIRenderer :: struct {
     flush :: (self: *UIRenderer) {
         if self.vertex_count == 0 { return; }
 
-        // Only bind the current texture (program, projection, VAO already bound in begin())
-        glBindTexture(GL_TEXTURE_2D, self.current_texture);
-
         upload_size : s64 = self.vertex_count * UI_VERTEX_BYTES;
-        // Use glBufferData to orphan the old buffer and avoid GPU sync stalls
-        glBufferData(GL_ARRAY_BUFFER, xx upload_size, self.vertices, GL_DYNAMIC_DRAW);
-        glDrawArrays(GL_TRIANGLES, 0, xx self.vertex_count);
+
+        if self.has_gpu {
+            // Mirror the GL path: bind current texture before drawing.
+            // current_texture may have changed since the last flush.
+            self.gpu.set_texture(0, self.current_texture);
+            self.gpu.update_buffer(self.mtl_vbuf, xx self.vertices, upload_size);
+            self.gpu.draw_triangles(0, xx self.vertex_count);
+        } else {
+            // Only re-bind the current texture (program, projection, VAO
+            // already bound in begin()). glBufferData orphans the old buffer
+            // to avoid GPU sync stalls.
+            glBindTexture(GL_TEXTURE_2D, self.current_texture);
+            glBufferData(GL_ARRAY_BUFFER, xx upload_size, self.vertices, GL_DYNAMIC_DRAW);
+            glDrawArrays(GL_TRIANGLES, 0, xx self.vertex_count);
+        }
 
         self.vertex_count = 0;
         self.draw_calls += 1;
@@ -458,3 +514,87 @@ void main() {
     }
 }
 GLSL;
+
+// --- Metal (MSL) — single library with vmain/fmain entry points ---
+//
+// `packed_float2 / packed_float4` keep the 12-float interleaved vertex
+// layout (pos2 / uv2 / color4 / params4 = 48 bytes) without padding —
+// MSL's default `float4` has 16-byte alignment and would force a 64-byte
+// struct (see examples/63-metal-clear.sx for the gotcha).
+//
+// Uniform passing: GL uses `glUniformMatrix4fv("uProj", proj)`; Metal
+// receives the projection via `setVertexBytes:length:atIndex:1` (slot 0
+// is the vertex buffer). Texture binding goes through
+// `setFragmentTexture:atIndex:0`.
+
+UI_MSL_SRC :: #string MSL
+#include <metal_stdlib>
+using namespace metal;
+
+struct UIVertex {
+    packed_float2 pos;
+    packed_float2 uv;
+    packed_float4 color;
+    packed_float4 params;
+};
+
+struct VOut {
+    float4 position [[position]];
+    float2 uv;
+    float4 color;
+    float4 params;
+};
+
+vertex VOut vmain(uint vid [[vertex_id]],
+                  constant UIVertex* verts [[buffer(0)]],
+                  constant float4x4& proj [[buffer(1)]]) {
+    UIVertex v = verts[vid];
+    VOut o;
+    o.position = proj * float4(v.pos, 0.0, 1.0);
+    o.uv = float2(v.uv);
+    o.color = float4(v.color);
+    o.params = float4(v.params);
+    return o;
+}
+
+static float roundedBoxSDF(float2 center, float2 half_size, float radius) {
+    float2 q = abs(center) - half_size + float2(radius);
+    return length(max(q, float2(0.0))) + min(max(q.x, q.y), 0.0) - radius;
+}
+
+fragment float4 fmain(VOut in [[stage_in]],
+                      texture2d<float> tex [[texture(0)]]) {
+    constexpr sampler s(coord::normalized, address::clamp_to_edge, filter::linear);
+
+    float mode = in.params.x;
+    float border = in.params.y;
+    float2 rectSize = in.params.zw;
+
+    if (mode < -1.5) {
+        // Image mode (mode == -2.0): sample texture
+        return tex.sample(s, in.uv) * in.color;
+    } else if (mode < 0.0) {
+        // Text mode (mode == -1.0): sample glyph atlas .r as alpha
+        float alpha = tex.sample(s, in.uv).r;
+        float ew = fwidth(alpha) * 0.7;
+        alpha = smoothstep(0.5 - ew, 0.5 + ew, alpha);
+        return float4(in.color.rgb, in.color.a * pow(alpha, 0.9));
+    } else if (mode > 0.0 || border > 0.0) {
+        // Rounded rect: SDF alpha, vertex color only
+        float2 half_size = rectSize * 0.5;
+        float2 center = (in.uv - float2(0.5)) * rectSize;
+        float dist = roundedBoxSDF(center, half_size, mode);
+        float aa = fwidth(dist);
+        float alpha = 1.0 - smoothstep(-aa, aa, dist);
+        if (border > 0.0) {
+            float inner = roundedBoxSDF(center, half_size - float2(border), max(mode - border, 0.0));
+            float border_alpha = smoothstep(-aa, aa, inner);
+            alpha = alpha * max(border_alpha, 0.0);
+        }
+        return float4(in.color.rgb, in.color.a * alpha);
+    } else {
+        // Plain rect: vertex color only
+        return in.color;
+    }
+}
+MSL;