diff --git a/src/store/store.sx b/src/store/store.sx
index 806c81c..367d870 100644
--- a/src/store/store.sx
+++ b/src/store/store.sx
@@ -6,13 +6,20 @@
 // `<root>/objects/<key>`. This key is what populates an
 // Artifact.sha256 / Artifact.storage_key at the domain boundary.
 //
-// Publish is a two-phase write: bytes are first written to
-// `<root>/staging/<key>`, then atomically renamed into
-// `<root>/objects/<key>`. The rename is the only operation that makes an
-// object visible at its final path, so an interrupted or failed write
-// never leaves a torn object — a half-written staging file is not
-// reachable as `objects/<key>`. Staging and objects share `<root>` (one
-// filesystem), so the rename is atomic.
+// Publish is a two-phase write: bytes are first written under
+// `<root>/staging/`, then atomically renamed into `<root>/objects/<key>`.
+// The rename is the only operation that makes an object visible at its
+// final path, so an interrupted or failed write never leaves a torn
+// object — a half-written staging file is not reachable as
+// `objects/<key>`. Staging and objects share `<root>` (one filesystem),
+// so the rename is atomic.
+//
+// `put_bytes` stages the in-memory bytes at `staging/<key>` (the key is
+// known up front). `put_file` reads its source exactly once: it copies
+// the source into a provisional `staging/incoming-<seq>`, then derives the
+// key from the SHA-256 of THAT staged file — the exact bytes that get
+// published. So `key == digest(published object)` holds even if the
+// source is mutated after the copy; the source is never read twice.
 //
 // Dedup: identical bytes hash to the same key, so a put whose object
 // already exists returns immediately without re-staging or rewriting.
@@ -56,9 +63,16 @@ digest_of_file :: (path: string) -> (string, !StoreErr) {
 
 Store :: struct {
     root: string;
+    // Per-store counter used to name `put_file`'s provisional staging
+    // files (`staging/incoming-<seq>`), so successive file puts through
+    // this Store get distinct temp names.
+    // NOTE(review): the counter restarts at 0 in every `init` and is
+    // bumped with a plain `+=`, so two Store values sharing one root can
+    // pick the same temp name — confirm the intended concurrency model.
+    seq: s64;
 
     init :: (root: string) -> Store {
-        return Store.{ root = root };
+        return Store.{ root = root, seq = 0 };
     }
 
     objects_dir :: (self: *Store) -> string { return path_join(self.root, "objects"); }
@@ -80,10 +94,14 @@ Store :: struct {
         return sp;
     }
 
-    // Phase 1 (file source): copy `src`'s bytes into `staging/<key>`.
-    stage_copy :: (self: *Store, key: string, src: string) -> (string, !StoreErr) {
+    // Phase 1 (file source): copy `src` once into a provisional staging
+    // file `staging/incoming-<seq>`. The key isn't known until these staged
+    // bytes are hashed, so the name is a per-put sequence — never
+    // `objects/<key>`, so an interrupted copy is never a published object.
+    stage_temp_copy :: (self: *Store, src: string) -> (string, !StoreErr) {
         if !fs.create_dir_all(self.staging_dir()) { raise error.Stage; }
-        sp := self.staging_path(key);
+        self.seq += 1;
+        sp := self.staging_path(concat("incoming-", int_to_string(self.seq)));
         if !fs.copy_file(src, sp) { raise error.Stage; }
         return sp;
     }
@@ -106,11 +124,26 @@ Store :: struct {
         return key;
     }
 
-    // Store a file's bytes and return their storage key. Dedup as above.
+    // Store a file's bytes and return their storage key. The source is
+    // read exactly once — copied into staging, then hashed there — so the
+    // returned key is the SHA-256 of the bytes actually published, not of a
+    // separate read that could disagree. Dedup: if the object already
+    // exists, the staged copy is dropped and the existing key returned.
+    // If hashing the staged copy fails, the temp is deleted before the
+    // failure is reported as `error.Stage`, so it does not linger in staging.
+    // NOTE(review): a failed `publish` still leaves `sp` behind unless
+    // `publish` removes it itself — confirm and clean up there if not.
     put_file :: (self: *Store, path: string) -> (string, !StoreErr) {
-        key := try digest_of_file(path);
-        if self.has(key) { return key; }
-        sp := try self.stage_copy(key, path);
-        try self.publish(sp, key);
-        return key;
+        sp := try self.stage_temp_copy(path);
+        key, derr := digest_of_file(sp);
+        if !derr {
+            if self.has(key) {
+                fs.delete_file(sp);
+                return key;
+            }
+            try self.publish(sp, key);
+            return key;
+        }
+        fs.delete_file(sp);
+        raise error.Stage;
     }
diff --git a/tests/store_content_addressed.sx b/tests/store_content_addressed.sx
index e8720e7..5832028 100644
--- a/tests/store_content_addressed.sx
+++ b/tests/store_content_addressed.sx
@@ -48,6 +48,17 @@ entry_count :: (dir: string) -> string {
     return res.stdout;
 }
 
+// Number of `put_file` staging temps (`incoming-*`) left under `dir`.
+// 0 means every file-source put cleaned up its staging copy.
+incoming_count :: (dir: string) -> string {
+    cmd := concat("ls -1 ", concat(dir, " 2>/dev/null | grep -c '^incoming-' | tr -dc '0-9'"));
+    r := process.run(cmd);
+    process.assert(r != null, "ls/grep must run");
+    res := r!;
+    if res.stdout.len == 0 { return "0"; }
+    return res.stdout;
+}
+
 main :: () -> s32 {
     root := ".sx-tmp/store-cas";
     process.run(concat("rm -rf ", root)); // fresh root, even after a crashed prior run
@@ -99,17 +110,45 @@ main :: () -> s32 {
     process.assert(!st.has(missing), "failed publish must leave no object");
     print(" atomicity: staged write invisible; failed publish leaves no object\n");
 
-    // ── 4. put_file: file source, same key + bytes ──────────────────────
+    // ── 4. put_file: single source read, key == digest of published object
     src := ".sx-tmp/store-cas-src.bin";
-    file_bytes := "the quick brown fox\n";
+    file_bytes := "file-source-bytes-123"; // shell-safe: no spaces/newlines
     process.assert(fs.write_file(src, file_bytes), "fixture source file must be written");
+
     fkey, fe := st.put_file(src);
     process.assert(!fe, "put_file must succeed");
-    process.assert(fkey == stdhash_key(file_bytes), "put_file key must equal std.hash of the file bytes");
     process.assert(st.has(fkey), "put_file object must be published");
+
+    // The returned key must be the SHA-256 of the bytes ACTUALLY published —
+    // re-hash the stored object and confirm it equals the key (and equals
+    // std.hash + shasum -a 256 of the original fixture).
     fstored := fs.read_file(st.object_path(fkey));
+    process.assert(fstored != null, "published object must be readable");
     process.assert(fstored! == file_bytes, "put_file stored bytes must equal the file");
-    print(" put_file: key {} published\n", fkey);
+    process.assert(stdhash_key(fstored!) == fkey, "key must equal SHA-256 of the published object");
+    process.assert(fkey == stdhash_key(file_bytes), "put_file key must equal std.hash of the file bytes");
+    process.assert(fkey == shasum_key(file_bytes), "put_file key must equal shasum -a 256");
+    process.assert(incoming_count(st.staging_dir()) == "0", "put_file must clean up its staging temp");
+    objs_after_file := entry_count(st.objects_dir());
+    print(" put_file: key {} == digest(published object)\n", fkey);
+
+    // Cross-path dedup: put_bytes of identical content yields the SAME key
+    // and adds no second object; the stored bytes are not rewritten.
+    bkey, be := st.put_bytes(file_bytes);
+    process.assert(!be, "cross-path put_bytes must succeed");
+    process.assert(bkey == fkey, "put_file and put_bytes of identical content share a key");
+    process.assert(entry_count(st.objects_dir()) == objs_after_file, "cross-path dedup adds no object");
+    afterb := fs.read_file(st.object_path(fkey));
+    process.assert(afterb != null, "object must stay readable after cross-path dedup");
+    process.assert(afterb! == file_bytes, "cross-path dedup must not rewrite the object");
+
+    // A repeat put_file hits dedup and also drops its staging temp.
+    fkey2, fe2 := st.put_file(src);
+    process.assert(!fe2, "repeat put_file must succeed");
+    process.assert(fkey2 == fkey, "repeat put_file dedup yields the same key");
+    process.assert(entry_count(st.objects_dir()) == objs_after_file, "repeat put_file adds no object");
+    process.assert(incoming_count(st.staging_dir()) == "0", "dedup put_file must clean up its staging temp");
+    print(" put_file: cross-path dedup, one object, staging cleaned\n");
 
     // ── cleanup ─────────────────────────────────────────────────────────
     process.run(concat("rm -rf ", root));