From 68c002ab06327fa0bb3794c9aaa2c6464134171d Mon Sep 17 00:00:00 2001 From: agra Date: Sat, 6 Jun 2026 00:34:21 +0300 Subject: [PATCH] P2.2: content-addressed artifact store (staging -> atomic move, dedup) Local blob store under src/store/, the first real consumer of std.hash. Objects are addressed by lowercase-hex SHA-256: the digest is the storage key and bytes live at <root>/objects/<key>. - put_bytes / put_file compute the digest via std.hash, write to a staging file, then atomically rename into objects/. The rename is the only step that publishes, so an interrupted/failed write never leaves a torn object at the final path. - Dedup: an already-published object short-circuits without re-staging. - stage_write/stage_copy + publish expose the two phases for the test. tests/store_content_addressed.sx asserts the storage key equals std.hash, an independent `shasum -a 256`, and the pinned SHA-256("abc") vector; that dedup stores one object and never rewrites it; that a staged write is invisible until publish and a failed publish leaves no object; and that put_file round-trips bytes. Gate: make build + make test both green. --- .gitignore | 3 + src/store/store.sx | 117 ++++++++++++++++++++++++++++++ tests/store_content_addressed.sx | 120 +++++++++++++++++++++++++++++++ 3 files changed, 240 insertions(+) create mode 100644 src/store/store.sx create mode 100644 tests/store_content_addressed.sx diff --git a/.gitignore b/.gitignore index 8687e41..0575287 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ # build artifacts from `make build` build/ + +# scratch store roots / fixtures created by tests (never /tmp) +.sx-tmp/ diff --git a/src/store/store.sx b/src/store/store.sx new file mode 100644 index 0000000..806c81c --- /dev/null +++ b/src/store/store.sx @@ -0,0 +1,117 @@ +// ===================================================================== +// store.sx — content-addressed blob store (subplan 02, Slice 3). 
+// +// Objects are addressed by the lowercase-hex SHA-256 of their bytes: +// the digest IS the storage key, and the bytes live at +// `<root>/objects/<key>`. This key is what populates an +// Artifact.sha256 / Artifact.storage_key at the domain boundary. +// +// Publish is a two-phase write: bytes are first written to +// `<root>/staging/<key>`, then atomically renamed into +// `<root>/objects/<key>`. The rename is the only operation that makes an +// object visible at its final path, so an interrupted or failed write +// never leaves a torn object — a half-written staging file is not +// reachable as `objects/<key>`. Staging and objects share `<root>` (one +// filesystem), so the rename is atomic. +// +// Dedup: identical bytes hash to the same key, so a put whose object +// already exists returns immediately without re-staging or rewriting. +// ===================================================================== + +#import "modules/std.sx"; +fs :: #import "modules/fs.sx"; +hash :: #import "modules/std/hash.sx"; + +// Failure classes for a put. `Stage` covers a failed staging step (directory creation, write or copy), +// `Publish` a failed publish step (directory creation or atomic rename), `Source` an unreadable input file. +StoreErr :: error { + Stage, + Publish, + Source, +} + +// Copy a by-value `[64]u8` digest into a heap `string` key. The hash +// modules return the digest on the stack, so the view over it is only +// valid until the array dies; this materialises an owned, null-terminated +// copy safe to store and use as a path component. +digest_to_key :: (d: [64]u8) -> string { + view := string.{ ptr = @d[0], len = 64 }; + return substr(view, 0, 64); +} + +// SHA-256 of an in-memory buffer, as the lowercase-hex storage key. +digest_of_bytes :: (bytes: string) -> string { + d := hash.sha256_hex(bytes); + return digest_to_key(d); +} + +// SHA-256 of a file's contents (streamed in fixed chunks), as the +// storage key. Raises `Source` if the file can't be opened/read. 
+digest_of_file :: (path: string) -> (string, !StoreErr) { + maybe := hash.sha256_file(path); + if maybe == null { raise error.Source; } + d := maybe!; + return digest_to_key(d); +} + +Store :: struct { + root: string; + + init :: (root: string) -> Store { + return Store.{ root = root }; + } + + objects_dir :: (self: *Store) -> string { return path_join(self.root, "objects"); } + staging_dir :: (self: *Store) -> string { return path_join(self.root, "staging"); } + object_path :: (self: *Store, key: string) -> string { return path_join(self.root, "objects", key); } + staging_path :: (self: *Store, key: string) -> string { return path_join(self.root, "staging", key); } + + // True once `key`'s bytes are published at their final path. + has :: (self: *Store, key: string) -> bool { + return fs.exists(self.object_path(key)); + } + + // Phase 1: write `bytes` to `staging/<key>`, returning the staging + // path. The bytes are not yet visible at `objects/<key>`. + stage_write :: (self: *Store, key: string, bytes: string) -> (string, !StoreErr) { + if !fs.create_dir_all(self.staging_dir()) { raise error.Stage; } + sp := self.staging_path(key); + if !fs.write_file(sp, bytes) { raise error.Stage; } + return sp; + } + + // Phase 1 (file source): copy `src`'s bytes into `staging/<key>`. + stage_copy :: (self: *Store, key: string, src: string) -> (string, !StoreErr) { + if !fs.create_dir_all(self.staging_dir()) { raise error.Stage; } + sp := self.staging_path(key); + if !fs.copy_file(src, sp) { raise error.Stage; } + return sp; + } + + // Phase 2: atomically move a staged file into `objects/<key>`. After + // this returns the object is published; before it, it never is. + publish :: (self: *Store, staged: string, key: string) -> !StoreErr { + if !fs.create_dir_all(self.objects_dir()) { raise error.Publish; } + if !fs.move(staged, self.object_path(key)) { raise error.Publish; } + return; + } + + // Store in-memory bytes and return their storage key. 
Dedup: an + // already-published object is returned without re-staging. + put_bytes :: (self: *Store, bytes: string) -> (string, !StoreErr) { + key := digest_of_bytes(bytes); + if self.has(key) { return key; } + sp := try self.stage_write(key, bytes); + try self.publish(sp, key); + return key; + } + + // Store a file's bytes and return their storage key. Dedup as above. + put_file :: (self: *Store, path: string) -> (string, !StoreErr) { + key := try digest_of_file(path); + if self.has(key) { return key; } + sp := try self.stage_copy(key, path); + try self.publish(sp, key); + return key; + } +} diff --git a/tests/store_content_addressed.sx b/tests/store_content_addressed.sx new file mode 100644 index 0000000..e8720e7 --- /dev/null +++ b/tests/store_content_addressed.sx @@ -0,0 +1,120 @@ +// Acceptance for P2.2 — the content-addressed artifact store. +// +// Drives a fresh store rooted under `.sx-tmp/` (never /tmp) and asserts +// the four Slice-3 invariants: +// 1. put → object lands at `objects/<key>` and its bytes round-trip; +// the storage key equals std.hash, an independent `shasum -a 256`, +// and the pinned SHA-256("abc") vector. +// 2. dedup — identical bytes are not stored twice and an existing +// object is never rewritten. +// 3. atomicity — a staged-but-unpublished write is invisible at the +// final path, and a publish that fails before/at the rename leaves +// no object. +// 4. put_file — a file source produces the same key and bytes. +// Exits 0 only if every assertion holds (process.assert aborts otherwise). +#import "modules/std.sx"; +fs :: #import "modules/fs.sx"; +hash :: #import "modules/std/hash.sx"; +process :: #import "modules/process.sx"; +#import "../src/store/store.sx"; + +// SHA-256("abc"), the FIPS 180-4 one-block known-answer vector. +ABC_SHA256 :: "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"; + +// std.hash digest of `s` as a heap string key (independent of the store). 
+stdhash_key :: (s: string) -> string { + d := hash.sha256_hex(s); + view := string.{ ptr = @d[0], len = 64 }; + return substr(view, 0, 64); +} + +// First 64 hex chars of `shasum -a 256` over `bytes`, via the shell. +// `bytes` is spliced unquoted into the command line, so it must be shell-safe (the fixtures here are plain ASCII). +shasum_key :: (bytes: string) -> string { + cmd := concat("printf '%s' ", concat(bytes, " | shasum -a 256")); + r := process.run(cmd); + process.assert(r != null, "shasum -a 256 must run"); + res := r!; + process.assert(res.exit_code == 0, "shasum -a 256 must exit 0"); + return substr(res.stdout, 0, 64); +} + +// Number of directory entries under `dir` as a decimal string, from `ls -1 <dir> | wc -l` with non-digits stripped. +entry_count :: (dir: string) -> string { + cmd := concat("ls -1 ", concat(dir, " | wc -l | tr -dc '0-9'")); + r := process.run(cmd); + process.assert(r != null, "ls/wc must run"); + res := r!; + return res.stdout; +} + +main :: () -> s32 { + root := ".sx-tmp/store-cas"; + process.run(concat("rm -rf ", root)); // fresh root, even after a crashed prior run + + st := Store.init(root); + + // ── 1. put + content addressing ───────────────────────────────────── + fixture := "abc"; + key, e := st.put_bytes(fixture); + process.assert(!e, "put_bytes(abc) must succeed"); + process.assert(key == ABC_SHA256, "key must equal pinned SHA-256(abc) vector"); + process.assert(key == stdhash_key(fixture), "store key must equal std.hash digest"); + process.assert(key == shasum_key(fixture), "store key must equal shasum -a 256"); + print(" store == std.hash == shasum == vector: {}\n", key); + + process.assert(st.has(key), "object must exist at objects/"); + stored := fs.read_file(st.object_path(key)); + process.assert(stored != null, "stored object must be readable"); + process.assert(stored! == fixture, "stored bytes must equal the input"); + + // ── 2. dedup: same bytes, one object, never rewritten ─────────────── + // Overwrite the object on disk; a deduped re-put must NOT touch it. 
+ process.assert(fs.write_file(st.object_path(key), "TAMPERED"), "tamper write must succeed"); + key2, e2 := st.put_bytes(fixture); + process.assert(!e2, "second put_bytes must succeed"); + process.assert(key2 == key, "dedup: identical bytes yield the same key"); + after := fs.read_file(st.object_path(key)); + process.assert(after! == "TAMPERED", "dedup: existing object must not be rewritten"); + process.assert(entry_count(st.objects_dir()) == "1", "dedup: exactly one object stored"); + // Restore the real bytes so the store is left consistent. + process.assert(fs.write_file(st.object_path(key), fixture), "restore write must succeed"); + print(" dedup: one object, copy skipped on re-put\n"); + + // ── 3. atomicity: staged write is invisible until publish ─────────── + pending := "interrupted-upload-bytes"; + pkey := stdhash_key(pending); + process.assert(!st.has(pkey), "fresh store: pending object must be absent"); + sp, se := st.stage_write(pkey, pending); + process.assert(!se, "stage_write must succeed"); + process.assert(fs.exists(sp), "staged file must exist after stage_write"); + process.assert(!st.has(pkey), "atomicity: object must NOT exist before the rename"); + + // A publish whose staging source is missing fails and publishes no object. + missing := "1111111111111111111111111111111111111111111111111111111111111111"; + process.assert(!st.has(missing), "precondition: no object for the missing key"); + failed := false; + st.publish(st.staging_path(missing), missing) catch { failed = true; }; + process.assert(failed, "publish of a missing staging file must fail"); + process.assert(!st.has(missing), "failed publish must leave no object"); + print(" atomicity: staged write invisible; failed publish leaves no object\n"); + + // ── 4. 
put_file: file source, same key + bytes ────────────────────── + src := ".sx-tmp/store-cas-src.bin"; + file_bytes := "the quick brown fox\n"; + process.assert(fs.write_file(src, file_bytes), "fixture source file must be written"); + fkey, fe := st.put_file(src); + process.assert(!fe, "put_file must succeed"); + process.assert(fkey == stdhash_key(file_bytes), "put_file key must equal std.hash of the file bytes"); + process.assert(st.has(fkey), "put_file object must be published"); + fstored := fs.read_file(st.object_path(fkey)); + process.assert(fstored! == file_bytes, "put_file stored bytes must equal the file"); + print(" put_file: key {} published\n", fkey); + + // ── cleanup ───────────────────────────────────────────────────────── + process.run(concat("rm -rf ", root)); + fs.delete_file(src); + + print("store_content_addressed: ALL CASES PASS\n"); + return 0; +}