diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd06ea2d..ab7f31f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,6 +52,12 @@ jobs: run: cargo test --lib --features test-utils - name: Run e2e tests run: cargo test --test e2e --features test-utils -- --test-threads=1 + - name: Run v12 storage-bound audit attack PoCs + run: cargo test --test poc_commitment_audit_attacks --features test-utils + - name: Run v12 live audit-handler tests + run: cargo test --test poc_audit_handler_live --features test-utils + - name: Run bootstrap-stall PoC regression marker + run: cargo test --test poc_bootstrap_stall --features test-utils doc: name: Documentation diff --git a/Cargo.toml b/Cargo.toml index 1175dc05..766660fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -134,6 +134,30 @@ name = "e2e" path = "tests/e2e/mod.rs" required-features = ["test-utils"] +# v12 storage-bound audit attack PoCs. Uses the test-only one-shot +# commitment builder/verifier helpers, so it requires the test-utils +# feature. CI runs it via `cargo test --test poc_commitment_audit_attacks +# --features test-utils`. +[[test]] +name = "poc_commitment_audit_attacks" +path = "tests/poc_commitment_audit_attacks.rs" +required-features = ["test-utils"] + +# Live responder-handler tests for the v12 audit. Use +# LmdbStorageConfig::test_default(), gated on test-utils. +[[test]] +name = "poc_audit_handler_live" +path = "tests/poc_audit_handler_live.rs" +required-features = ["test-utils"] + +# Bootstrap-stall DoS regression marker (documents the unfixed attack; the +# eventual fix must land with a follow-up test asserting bounded drain). +# Declared like the other PoC suites so CI invokes it explicitly. +[[test]] +name = "poc_bootstrap_stall" +path = "tests/poc_bootstrap_stall.rs" +required-features = ["test-utils"] + [features] default = ["logging"] # Enable tracing/logging infrastructure. 
diff --git a/docs/adr/ADR-0002-gossip-triggered-contiguous-subtree-audit.md b/docs/adr/ADR-0002-gossip-triggered-contiguous-subtree-audit.md new file mode 100644 index 00000000..01c70dd2 --- /dev/null +++ b/docs/adr/ADR-0002-gossip-triggered-contiguous-subtree-audit.md @@ -0,0 +1,251 @@ +# ADR-0002: Gossip-triggered contiguous-subtree storage audit + +- **Status:** Proposed +- **Date:** 2026-06-04 +- **Decision owners:** Anselme (@grumbach) +- **Reviewers:** +- **Supersedes:** none +- **Superseded by:** none +- **Related:** none + +## Context + +In this network, nodes are paid to store data chunks. To verify a node actually +holds what it is paid for, each node publishes a signed **storage commitment**: a +Merkle tree built over the chunks it claims to hold (one leaf per chunk, the leaf +being a hash of the chunk's content which incidentally also is its address on the network), reduced to a single root hash and signed by +the node's key. The commitment is spread to neighbouring nodes through the +network's normal periodic message exchange ("gossip"). Any neighbour can then choose to +**audit** the node: ask it to prove it still holds the committed chunks, sampled +probabilistically so that no single audit is expensive but cheating is caught over time. + +Triggered by gossip, the audits run as occasional surprise +exams with no answer that escapes accounting: every failure is attributable to misbehaviour, including failure to respond in a reasonable time. + +### Relationship to the responsible-chunk audit + +This ADR covers the **storage-commitment audit**: it proves a node still holds +the data it *claims* to hold (the chunks in its own signed commitment) — i.e. it +catches deletion and on-demand relaying of committed data. 
It does **not** replace +the pre-existing **responsible-chunk audit**, which proves a node holds the data +it *should* hold: a node samples keys a close peer is responsible for (and was +previously hinted to store) and challenges the peer to produce per-key possession +digests. The two are complementary and run side by side — the storage-commitment +audit is gossip-triggered (no timer of its own), while the responsible-chunk audit +keeps its own periodic tick. Each has independent accounting; this ADR's +adaptive-grace and rollout gating apply to the storage-commitment audit only. + +Terms used below: *root* = the single top hash of a node's storage-commitment +Merkle tree. *Leaf* = the hash of one stored chunk. *N* = the number of chunks a +node has committed to. *Subtree* = a contiguous branch of the tree (a node in the +tree plus everything beneath it). *Padding* = empty filler leaves added so the +tree is a clean binary shape when N is not a power of two. + +## Decision Drivers + +- Ensure all nodes actually store the data they claim they are storing +- Keep each proof small and keep steady-state audit traffic low. +- Catch the three real cheating strategies: storing nothing and fetching on demand; deleting some fraction of data; and keeping only chunk *addresses* (which are public) while never holding the actual bytes, then fabricating proofs. +- Reuse the existing cryptographic building blocks (the Merkle tree, the signed commitment, the freshness hash) without inventing new ones. +- Never wrongly penalise honest nodes, even in extreme cases such as small or dense networks where every node legitimately holds almost all of the data. + +## Considered Options + +1. **Keep the previous timer-driven schedule and just make the excusable answers + punishable.** Rejected: an audit answer like "I don't recognise that commitment" + was excusable *precisely because* the audited commitment was stale relative to + what the node had since published. 
Without fixing the schedule, punishing such + answers would also punish honest nodes whose latest commitment simply hadn't + propagated yet. + +2. **Keep naming individual chunks to audit, but trigger the audit from gossip.** + A better trigger, but it keeps the large, scattered proof (a separate inclusion + path per sampled chunk) and the "auditor names the chunks" model, which lets a + node honestly answer "that chunk isn't in my commitment" — another answer that + has to be excused. + +3. **Gossip-triggered, single contiguous-subtree proof (chosen).** Receiving a + node's commitment is what may launch an audit, checked against that freshly + published commitment. A random value chosen by the auditor deterministically + selects one contiguous branch of the audited node's *own* tree; the node returns + that whole branch plus a small summary of the rest; the auditor rebuilds the + root, spot-checks a few leaves against real chunk bytes, and requires a timely + response. Small proof, no excusable answers, surprises the node. + +4. **Select several branches per audit instead of one.** Rejected: against an + attacker who deletes data in large contiguous blocks, the per-audit chance of + catching them depends only on the *fraction* deleted, not on how many or how + large the branches are. Extra branches only add proof cost; a fresh random + selection each audit covers the tree over time anyway. + +## Decision + +We will make the audit **gossip-triggered** and replace its proof shape with a +**single contiguous-subtree storage proof**, reusing the existing tree, +commitment, and freshness-hash primitives. + +- **Trigger.** When a node ingests a neighbour's commitment during normal + (steady-state) operation, it may start an audit of that neighbour — not every + time, but with a fixed probability and a per-neighbour cooldown, so audits are + occasional surprise exams that keep traffic low. 
The decision is cooldown-first + then the probability lottery, so a burst of gossip from one peer yields at most + one audit attempt per cooldown window. The audit always checks the neighbour + against the commitment it *just published*, and a *stable* commitment is still + re-audited over time (the trigger fires on every steady-state gossip, not only + on a changed root). This storage-commitment audit has no periodic timer of its + own; it is driven entirely by gossip and runs **alongside** the separate + responsible-chunk audit (see "Relationship to the responsible-chunk audit"), + which keeps its own periodic tick. + *Exception:* gossip received during the node's own bootstrap is cached but does + NOT trigger an audit — the node may itself still be bootstrapping (audits are + gated on that) and its routing-table view is not yet stable. Such a peer is + audited on the first steady-state gossip round after bootstrap drains (within + one sync cycle), so there is no coverage gap. + +- **Subtree selection.** The auditor sends a fresh random value. That value walks + the tree from the root downward (each bit picking left or right) and stops at + the smallest contiguous branch that still contains at least the square root of N + *real* (non-padding) leaves. Stopping on a real-leaf count — rather than at a + fixed depth — is deliberate: a fixed depth can, when the tree is mostly padding, + land on a branch that is entirely padding, so the audit checks nothing. The + real-leaf rule makes an empty selection impossible. The random value alone fixes + *which* branch is selected: the auditor and the audited node each walk the tree + from it independently and arrive at the same branch, so the audited node cannot + choose a convenient branch to present. The auditor then checks that the returned + branch is exactly the one the random value selects and that it contains at least + the square root of the claimed held chunks in real leaves. 
+ +- **The proof.** The audited node returns every leaf of the selected subtree — + each given both as the plain content hash and as a freshness hash (the content + mixed with the auditor's random value) — plus one summary hash per level for the + unselected siblings along the path to the root. Everything outside the selected + branch costs a single hash; nothing there is touched. + +- **Verification, three independent checks.** + - *Structure:* rebuild the root from the returned subtree and the sibling + summaries; it must equal the freshly-published root the audit was started + against. This proves the subtree genuinely belongs to the committed tree. + - *Real bytes:* pick a small fixed number of leaves at random from within the + subtree and demand the original chunk bytes for exactly those keys from the + audited node itself (a second-round surprise challenge under the same nonce), + then confirm both the plain hash (the chunk's content address) and the + freshness hash match the served bytes. Possession is non-delegable: the + auditor needs to hold none of the node's chunks, and a committed key the + node cannot serve is a deterministic failure, never bad luck. + This defeats a node that + rebuilt the tree from public chunk addresses but never held the bytes: it + cannot serve content that hashes to the committed address, so faking a + fraction of leaves survives only with probability (1 − fraction) raised to the + number of spot-checks. + - *Possession in time:* the whole response must arrive within a deadline sized + to hashing the subtree from local disk. A node that doesn't hold the data must + fetch it across the network first and misses the deadline. + +- **Retention — "you stay answerable for what you publish."** A node keeps the + chunk data behind its **last two published commitments**. Two, not one, absorbs + the normal race where an auditor is asking about the commitment a node published + just before its newest one. 
Because of this, an honest node can always answer an + audit about a commitment it published recently — so "I don't recognise that + commitment" about a recently-published root is now provably misbehaviour, not + lag. + +- **Accounting and False Positives** "That chunk isn't in my commitment" + can never occur, because the auditor only ever challenges leaves of the node's + *own* committed tree, so every challenged leaf is in the commitment by + construction. Failures that are deterministic and cannot be caused by bad luck — a + rebuilt root that doesn't match, a content or freshness hash that doesn't match, + or repudiating a recently-published commitment — are acted on **the first time + they occur**, because re-asking cannot turn a genuine failure into a pass. + Failures that *can* be caused by transient bad luck — a missed response deadline + — keep a small grace allowance of consecutive misses (reset on any success) + before counting, so a momentarily slow but honest node is not punished. This + grace allowance is the *only* failure type that the adaptive scaling below + touches; deterministic failures are always acted on the first time, regardless + of network conditions. + +- **Closeness** A node should mostly hold chunks whose addresses are + near its own. We may flag a selected leaf as suspicious padding only when its + address is implausibly far from the node *relative to how much data overlap is + normal on this network*. On a small, dense network where every node holds nearly + everything, "far" chunks are normal and must never trigger a penalty. This check + is intentionally biased toward missing some padding rather than ever wrongly + penalising an honest node. 
+ +- **Network Resilience** In the event of large churn or generalized network + disruption, to prevent a death spiral, the **timeout** grace allowance (and only + that allowance) scales with how widely *timeouts* are currently being seen: the + number of consecutive deadline misses tolerated is the median recent *timeout* + count across recently-audited peers plus a constant (in a healthy network this is + roughly 0 + 3). Crucially, the scaling is driven by missed-deadline / liveness + signals — never by deterministic failures (a bad root or a bad hash), which are + always acted on immediately and can therefore never be inflated by an attacker to + buy itself more grace. Genuine disruption makes *honest* nodes time out together, + lifting the median and relaxing the deadline tolerance just when the network is + struggling; once conditions normalise the median falls back toward zero and the + tolerance tightens again. Because most nodes are honest, the median sits near + zero in normal operation, so this never weakens detection of a node that is + actually deleting data. + +## Consequences + +### Positive + +- The deterministic nature of the 3 checks makes a faked proof detectable: a structurally wrong, byte-less, or stale answer fails outright, and repeated probabilistic sampling catches the cases that can only be hidden in one branch at a time. +- The probabilistic approach to verification ensures that each individual audit is cheap while detection remains effective over time. +- Each proof is small and contiguous (about the square root of N leaves plus a handful of summary hashes) instead of many scattered inclusion paths. 
+- Audits are surprise exams pinned to the *freshly published* commitment, so there is no stale-data ambiguity unlike in the previous audit design +- Three independent defences cover the three cheating strategies: structure (belongs to the committed tree), real bytes (actually held, not fabricated from public addresses), and timeliness (held locally, not fetched on demand). +- Acting on the first deterministic failure roughly cuts time-to-detection compared with requiring several strikes, with no added risk of false positives. + +### Negative / Trade-offs + +- **Big-block deletion is caught only proportionally.** An attacker who deletes data in large contiguous blocks is caught, per audit, with probability roughly equal to the fraction deleted — independent of N and of subtree size. We accept this: there is no economic reason to delete a *small* fraction (you save almost nothing and are still eventually caught), and a node that deletes a large fraction to actually save resources is caught within one or two audits. If ever needed, the lever is auditing *more often*, not bigger subtrees. +- **Inflating the claimed size is not fully prevented.** Only the selected subtree and the path summaries are verified each audit, so filler leaves elsewhere could inflate the claimed chunk count. Both the regular audits and the closeness check mitigate this over time. Fully auditing the entire claimed set would be too much effort. We accept this probabilistic approach in which over time cheaters are detected. +- **Retention has a storage cost.** A node must keep the chunk data behind its last two published commitments. This is an accepted cost. +- **The audit format change is breaking.** The whole network must upgrade before the new audit can be relied on and before eviction is enabled. 
+ +### Neutral / Operational + +- Introduces a few tunable settings: the per-gossip audit probability, the per-neighbour cooldown, the number of real-byte spot-checks, and the retention count (two). The grace allowance for missed deadlines reuses the existing strike threshold and applies to deadline misses only. +- The storage-commitment audit needs no periodic timer of its own — it is driven by gossip. (The separate responsible-chunk audit keeps its periodic tick; the two run side by side.) The related "node is capable but has no current commitment" special case is unnecessary on the gossip-triggered path, since that path always has a freshly-published commitment to pin. A silent node needs no special handling for this audit — it simply stops earning storage credit, so all nodes are naturally motivated to gossip. +- At the chosen settings, steady-state audit load is on the order of a handful of small audits per node per hour. + +## Validation + +How we will know this decision remains correct: + +- **Detection holds in simulation.** For deletions spread evenly across a node's + data, the per-audit chance of catching it rises quickly with the square root of + N; for deletions concentrated in large contiguous blocks (the worst case), it is + roughly the deleted fraction per audit. A simulation must confirm both rates and + that, at the chosen settings, a node deleting a meaningful fraction is caught + within one or two audits and a worst-case concentrated large deletion within + about an hour. Detection must not depend on ever sampling the whole tree. 
+ +- **Tests required before this ADR is Accepted.** Branch selection is deterministic + and identical on the auditor and the audited node; selection never lands on an + all-padding branch across many awkward sizes (a regression test for the + fixed-depth flaw this ADR fixes); the root rebuilds correctly from a single-branch + proof; possession verifies from the bytes the audited node itself serves in the + second-round byte challenge (the auditor holding none of them); a committed key + the node cannot serve is a deterministic failure; the real-byte spot-check catches a node that fabricated + freshness hashes, at the expected probability; deterministic failures are acted on + the first time while deadline misses honour the grace allowance; the adaptive + timeout grace responds to widespread timeouts but never to deterministic failures; + repudiating a recently-published commitment fails; the last two published + commitments stay answerable; the response deadline is sized correctly; and a flood + of gossip does not multiply audits. + +- **Operational signals and re-open triggers.** Audits per node per hour stay within + budget; false-positive penalties on a small, dense test network stay at zero + (confirming the closeness leniency and the adaptive grace hold); during induced + churn the network does not enter an eviction death spiral; revisit the + concentrated-deletion trade-off if a real attacker is ever observed deleting below + the economically-irrational threshold; revisit if the maximum supported committed + size is approached. + +## Notes for AI-assisted work + +AI tools may help draft this ADR, but **must not mark it Accepted without human +review**. Accepted ADRs are immutable: create a new superseding ADR rather than +editing an Accepted ADR. 
diff --git a/src/node.rs b/src/node.rs index e63ec272..8b7a16fd 100644 --- a/src/node.rs +++ b/src/node.rs @@ -46,6 +46,36 @@ impl NodeBuilder { Self { config } } + /// Reject startup in production mode without a usable rewards address. + /// + /// A node that cannot receive payment must not silently run on the + /// production network. The placeholder address shipped in the example + /// config and an empty string both count as "unconfigured". + /// + /// # Errors + /// + /// Returns [`Error::Config`] if `network_mode` is `Production` and + /// `payment.rewards_address` is unset, empty, or the example placeholder. + fn validate_production_rewards_address(config: &NodeConfig) -> Result<()> { + if config.network_mode != NetworkMode::Production { + return Ok(()); + } + let configured = config + .payment + .rewards_address + .as_deref() + .is_some_and(|addr| !addr.is_empty() && addr != "0xYOUR_ARBITRUM_ADDRESS_HERE"); + if configured { + Ok(()) + } else { + Err(Error::Config( + "CRITICAL: Rewards address is not configured. \ + Set payment.rewards_address in config to your Arbitrum wallet address." + .to_string(), + )) + } + } + /// Build and start the node. /// /// # Errors @@ -54,26 +84,7 @@ impl NodeBuilder { pub async fn build(mut self) -> Result { info!("Building ant-node with config: {:?}", self.config); - // Validate rewards address in production - if self.config.network_mode == NetworkMode::Production { - match self.config.payment.rewards_address { - None => { - return Err(Error::Config( - "CRITICAL: Rewards address is not configured. \ - Set payment.rewards_address in config to your Arbitrum wallet address." - .to_string(), - )); - } - Some(ref addr) if addr == "0xYOUR_ARBITRUM_ADDRESS_HERE" || addr.is_empty() => { - return Err(Error::Config( - "CRITICAL: Rewards address is not configured. \ - Set payment.rewards_address in config to your Arbitrum wallet address." 
- .to_string(), - )); - } - Some(_) => {} - } - } + Self::validate_production_rewards_address(&self.config)?; // Resolve identity and root_dir (may update self.config.root_dir) let identity = Arc::new(Self::resolve_identity(&mut self.config).await?); @@ -150,6 +161,7 @@ impl NodeBuilder { Arc::clone(&p2p_arc), storage_arc, payment_verifier_arc, + Arc::clone(&identity), &self.config.root_dir, fresh_rx, shutdown.clone(), diff --git a/src/payment/metrics.rs b/src/payment/metrics.rs index badd4f55..b59c19f5 100644 --- a/src/payment/metrics.rs +++ b/src/payment/metrics.rs @@ -33,6 +33,18 @@ impl QuotingMetricsTracker { self.close_records_stored.fetch_add(1, Ordering::SeqCst); } + /// Overwrite the counter with an authoritative count of held records. + /// + /// This is the deletion-aware path and the SINGLE source of truth for the + /// priced record count: the handler calls it at quote time with the live + /// LMDB entry count (`current_chunks()`), so any record removed from + /// storage — by delete, prune, or otherwise — is reflected on the next + /// quote with no per-delete bookkeeping to keep in sync. `record_store` + /// remains only an optimistic between-quote hint; the resync overwrites it. + pub fn set_records(&self, count: usize) { + self.close_records_stored.store(count, Ordering::SeqCst); + } + /// Get the number of records stored. #[must_use] pub fn records_stored(&self) -> usize { @@ -62,4 +74,22 @@ mod tests { tracker.record_store(); assert_eq!(tracker.records_stored(), 3); } + + #[test] + fn test_set_records_resyncs_to_authoritative_count() { + let tracker = QuotingMetricsTracker::new(100); + assert_eq!(tracker.records_stored(), 100); + + // Resync down (e.g. after deletions/pruning the store now holds fewer). + tracker.set_records(42); + assert_eq!(tracker.records_stored(), 42); + + // Resync up (e.g. after new stores). + tracker.set_records(57); + assert_eq!(tracker.records_stored(), 57); + + // Resync to zero (empty store). 
+ tracker.set_records(0); + assert_eq!(tracker.records_stored(), 0); + } } diff --git a/src/payment/quote.rs b/src/payment/quote.rs index 5a1a44d9..e662a0eb 100644 --- a/src/payment/quote.rs +++ b/src/payment/quote.rs @@ -237,6 +237,17 @@ impl QuoteGenerator { self.metrics_tracker.record_store(); } + /// Resync the quoting metric to an authoritative count of held records. + /// + /// The quote price is driven by `records_stored()`. A monotonic store + /// counter would let a node delete chunks it was paid to hold yet keep + /// quoting as if it still held everything. Callers pass the authoritative + /// count of records the node ACTUALLY HOLDS (from the storage layer) so the + /// price reflects current holdings, including deletions and pruning. + pub fn resync_records(&self, count: usize) { + self.metrics_tracker.set_records(count); + } + /// Create a merkle candidate quote for batch payment using ML-DSA-65. /// /// Returns a `MerklePaymentCandidateNode` constructed with the node's diff --git a/src/replication/bootstrap.rs b/src/replication/bootstrap.rs index a4ea7026..65025142 100644 --- a/src/replication/bootstrap.rs +++ b/src/replication/bootstrap.rs @@ -125,7 +125,7 @@ pub async fn check_bootstrap_drained( // Hints capacity-rejected at the pending_verify bounds during bootstrap // must be re-delivered by the originating source before drain can be // claimed; otherwise we'd silently mark ourselves complete with - // outstanding work the source still owes us (codex round-2 BLOCKER). + // outstanding work the source still owes us. // The set retires per-source as each source's next admission cycle // completes with zero rejections — see `clear_capacity_rejected`. if !state.capacity_rejected_sources.is_empty() { diff --git a/src/replication/commitment.rs b/src/replication/commitment.rs new file mode 100644 index 00000000..093cce43 --- /dev/null +++ b/src/replication/commitment.rs @@ -0,0 +1,906 @@ +//! Storage-bound audit via piggybacked commitments. +//! +//! 
Implements the v12 storage-bound audit design: it closes the +//! storage-binding holes where a node could pass audits while holding chunk +//! addresses (not bytes), or answer against a commitment it never gossiped. +//! +//! ## What this module provides +//! +//! - [`StorageCommitment`] — the wire type sent on neighbour-sync gossip +//! and embedded in commitment-bound audit responses. `ML-DSA-65` signed +//! over `(root, key_count, sender_peer_id, sender_public_key)` with explicit domain separation. +//! - [`MerkleTree`] — an in-memory Merkle tree over `(key, BLAKE3(bytes))` +//! leaves. Rebuilt by the responder when its key set changes; produces +//! inclusion paths used in audit responses. +//! - [`commitment_hash`] — the auditor's pin: a `BLAKE3` digest over the +//! full signed commitment blob. Audit challenges carry this; audit +//! responses must include a commitment that hashes to the same value. +//! - [`verify_path`] — auditor's per-key check: rebuilds the leaf from +//! `(key, bytes_hash)` and verifies the inclusion path against the +//! committed root. +//! +//! Nothing else (responder gossip loop, auditor verify path, +//! reward-eligibility cache) lives here yet — that's the next phase. + +use blake3::Hasher; +use saorsa_pqc::api::sig::{ + ml_dsa_65, MlDsaPublicKey, MlDsaSecretKey, MlDsaSignature, MlDsaVariant, +}; +use serde::{Deserialize, Serialize}; + +use crate::ant_protocol::XorName; + +/// Domain-separation tag for the commitment signature. +/// +/// Signed payload is BLAKE3 over (this tag || canonical commitment fields). +pub const DOMAIN_COMMITMENT: &[u8] = b"autonomi.ant.replication.storage_commitment.v1"; + +/// Domain-separation tag for the auditor's pin: BLAKE3 over (this tag || +/// canonical commitment blob). +pub const DOMAIN_COMMITMENT_HASH: &[u8] = b"autonomi.ant.replication.commitment_hash.v1"; + +/// Domain-separation tag for Merkle leaves: `BLAKE3(this || key || H(bytes))`. 
+pub const DOMAIN_LEAF: &[u8] = b"autonomi.ant.replication.storage_leaf.v1"; + +/// Domain-separation tag for Merkle internal nodes: `BLAKE3(this || left || right)`. +pub const DOMAIN_NODE: &[u8] = b"autonomi.ant.replication.storage_node.v1"; + +/// Maximum number of keys a single commitment may cover. +/// +/// Bounds the Merkle path depth (audit responses carry `O(log2 key_count)` +/// hashes per key) and the responder-side tree memory. A node storing more +/// keys than this would need to split its claim — out of scope for v1. +pub const MAX_COMMITMENT_KEY_COUNT: u32 = 1_000_000; + +/// Signed storage commitment. +/// +/// Piggybacked on neighbour-sync gossip. The signature commits to the +/// Merkle root, key count, sender peer ID, **and the sender's ML-DSA-65 +/// public key** under [`DOMAIN_COMMITMENT`]. +/// +/// Embedding the public key lets any receiver verify the signature +/// without an external `PeerId → MlDsaPublicKey` lookup. Binding the +/// public key in the signed payload prevents a key-swap attack where an +/// adversary keeps the message body but re-signs it under a different key +/// to claim a different identity. The peer-id binding (gate 2a in +/// `verify_commitment_bound_response`) still ensures the embedded key +/// belongs to the gossiping peer. +/// +/// # Wire size +/// +/// One commitment is approximately 5.3 KiB: +/// - root: 32 B +/// - `key_count`: 4 B +/// - `sender_peer_id`: 32 B +/// - `sender_public_key`: 1952 B (ML-DSA-65 public key) +/// - signature: 3293 B (ML-DSA-65 signature) +/// +/// Piggybacked on every `NeighborSyncRequest`/`Response` (~1 h interval +/// per close-group peer at the neighbour-sync cooldown cadence). At a +/// realistic close-group size of 8 with bidirectional sync, that's +/// roughly 8 × 2 × 5.3 KiB / hour = ~85 KiB/h of additional gossip +/// per node. Negligible against typical chunk-transfer bandwidth. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct StorageCommitment { + /// Merkle root over the responder's claimed keys. + pub root: [u8; 32], + /// Number of leaves committed over. + pub key_count: u32, + /// Sender peer ID, bound to the signature. + pub sender_peer_id: [u8; 32], + /// Sender's ML-DSA-65 public key bytes (1952 bytes). Embedded so + /// receivers can verify the signature without a separate pubkey + /// directory. Bound by the signature. + pub sender_public_key: Vec, + /// ML-DSA-65 signature over canonical commitment fields. 3293 bytes. + pub signature: Vec, +} + +// --------------------------------------------------------------------------- +// Hashing helpers +// --------------------------------------------------------------------------- + +/// Compute the Merkle leaf hash for `(key, bytes_hash)`. +/// +/// `bytes_hash` is BLAKE3 over the record bytes; the leaf binds the key to +/// the content so an adversary cannot reuse a leaf for a different chunk. +#[must_use] +pub fn leaf_hash(key: &XorName, bytes_hash: &[u8; 32]) -> [u8; 32] { + let mut h = Hasher::new(); + h.update(DOMAIN_LEAF); + h.update(key); + h.update(bytes_hash); + *h.finalize().as_bytes() +} + +/// Combine two child hashes into a Merkle internal-node hash. +#[must_use] +pub fn node_hash(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { + let mut h = Hasher::new(); + h.update(DOMAIN_NODE); + h.update(left); + h.update(right); + *h.finalize().as_bytes() +} + +/// The auditor's pin: `BLAKE3(DOMAIN_COMMITMENT_HASH || postcard(commitment))`. +/// +/// Equal commitments produce equal hashes; any change to `root`, `key_count`, +/// peer ID, or signature changes the hash because postcard's canonical +/// encoding includes a length prefix for `signature`. The audit challenge +/// carries this value; the audit response must include a commitment that +/// hashes to the same value, defeating fresh-commitment substitution. 
+/// +/// Postcard encoding is the same canonical wire form the rest of the +/// replication protocol uses (`MessageCodec::encode`), so an encoded +/// commitment from a `NeighborSyncRequest` produces the same hash as the +/// same commitment received in an `AuditResponse`. +/// +/// # Errors +/// +/// Returns `None` only if postcard fails to serialize the commitment, which +/// in practice means the signature is somehow `> isize::MAX` bytes — not +/// reachable for ML-DSA-65 (3293 bytes). Callers may safely treat `None` as +/// a malformed commitment and drop it. +#[must_use] +pub fn commitment_hash(c: &StorageCommitment) -> Option<[u8; 32]> { + let serialized = postcard::to_allocvec(c).ok()?; + let mut h = Hasher::new(); + h.update(DOMAIN_COMMITMENT_HASH); + h.update(&serialized); + Some(*h.finalize().as_bytes()) +} + +/// Canonical bytes the ML-DSA signature covers: the commitment fields +/// minus the signature itself. +/// +/// `sender_public_key` is included so an adversary cannot keep the body +/// and re-sign under a different key (the audit-time verifier would +/// otherwise accept the swap because verification uses the embedded key). +fn commitment_signed_payload( + root: &[u8; 32], + key_count: u32, + sender_peer_id: &[u8; 32], + sender_public_key: &[u8], +) -> Vec { + let mut v = Vec::with_capacity(32 + 4 + 32 + 4 + sender_public_key.len()); + v.extend_from_slice(root); + v.extend_from_slice(&key_count.to_le_bytes()); + v.extend_from_slice(sender_peer_id); + // Length-prefix the pubkey so two different (key, suffix) splits cannot + // produce the same byte stream (canonical encoding). 
+ let pk_len = u32::try_from(sender_public_key.len()).unwrap_or(u32::MAX); + v.extend_from_slice(&pk_len.to_le_bytes()); + v.extend_from_slice(sender_public_key); + v +} + +// --------------------------------------------------------------------------- +// Merkle tree +// --------------------------------------------------------------------------- + +/// In-memory Merkle tree over the responder's claimed keys. +/// +/// Leaves are `BLAKE3(DOMAIN_LEAF || key || BLAKE3(bytes))`, sorted by +/// `key`. Internal nodes are `BLAKE3(DOMAIN_NODE || left || right)`. When +/// a level has an odd number of nodes, the last node is paired with +/// **itself** — i.e. `node_hash(x, x)` — so the level above has +/// `ceil(n/2)` nodes. This is a standard self-pair construction (NOT +/// node promotion) and deterministically maps any non-empty key set to +/// a single root. +/// +/// Rebuilt by the responder whenever its key set changes meaningfully +/// (debounced in the integration layer; not this module's concern). +pub struct MerkleTree { + /// Sorted leaves, indexed by their position in the sorted key set. + /// + /// `leaves[i] = (key_i, leaf_hash(key_i, bytes_hash_i))`. + leaves: Vec<(XorName, [u8; 32])>, + /// Tree levels, level 0 is the leaves and the last level is the root. + /// + /// `levels[0].len() == leaves.len()`; `levels[L].len() == 1` where L + /// is the root level. + levels: Vec>, +} + +impl MerkleTree { + /// Build a Merkle tree over `(key, bytes_hash)` pairs. + /// + /// `entries` does not need to be sorted; this method sorts internally + /// so the produced root is deterministic per key set. Duplicate keys + /// are an error: the responder must deduplicate before calling. + /// + /// # Errors + /// + /// Returns an error if `entries` is empty (no commitment to make), if + /// `entries.len() > MAX_COMMITMENT_KEY_COUNT`, or if it contains + /// duplicate keys. 
+ pub fn build(mut entries: Vec<(XorName, [u8; 32])>) -> Result { + if entries.is_empty() { + return Err(CommitmentError::EmptyKeySet); + } + if entries.len() > MAX_COMMITMENT_KEY_COUNT as usize { + return Err(CommitmentError::TooManyKeys(entries.len())); + } + + entries.sort_by_key(|a| a.0); + for w in entries.windows(2) { + if let [a, b] = w { + if a.0 == b.0 { + return Err(CommitmentError::DuplicateKey(a.0)); + } + } + } + + let leaves: Vec<(XorName, [u8; 32])> = entries + .into_iter() + .map(|(k, bh)| { + let lh = leaf_hash(&k, &bh); + (k, lh) + }) + .collect(); + + let mut level: Vec<[u8; 32]> = leaves.iter().map(|(_, h)| *h).collect(); + let mut levels = vec![level.clone()]; + while level.len() > 1 { + level = build_next_level(&level); + levels.push(level.clone()); + } + + Ok(Self { leaves, levels }) + } + + /// The Merkle root of this tree. + /// + /// `unwrap`-free: `build` guarantees at least one level with at least + /// one entry, so `last().first()` is always `Some`. + #[must_use] + pub fn root(&self) -> [u8; 32] { + // SAFETY: build() enforces non-empty entries → non-empty leaves → + // non-empty levels → last level has exactly one hash. + self.levels + .last() + .and_then(|l| l.first()) + .copied() + .unwrap_or([0u8; 32]) + } + + /// The number of leaves (== claimed keys). + #[must_use] + pub fn key_count(&self) -> u32 { + // Cast is safe because build() rejects > MAX_COMMITMENT_KEY_COUNT. + u32::try_from(self.leaves.len()).unwrap_or(u32::MAX) + } + + /// Inclusion path for `key` from its leaf up to (but not including) + /// the root. + /// + /// Returns `None` if `key` is not in this tree. + #[must_use] + pub fn path_for(&self, key: &XorName) -> Option> { + let idx = self.leaves.binary_search_by(|(k, _)| k.cmp(key)).ok()?; + + let mut path = Vec::with_capacity(self.levels.len()); + let mut i = idx; + for level in &self.levels[..self.levels.len().saturating_sub(1)] { + // Sibling is the *other* half of the pair containing `i`. 
If + // `i` is the unpaired last node at this level, its sibling is + // itself (matches the self-pair construction in + // `build_next_level`). + let sibling_idx = if i % 2 == 0 { + if i + 1 < level.len() { + i + 1 + } else { + i + } + } else { + i - 1 + }; + path.push(level[sibling_idx]); + i /= 2; + } + Some(path) + } + + /// Iterate over `(key, leaf_hash)` pairs in sorted order. Test-only. + #[cfg(test)] + pub(crate) fn iter_leaves(&self) -> impl Iterator { + self.leaves.iter() + } + + /// The keys this tree commits to, in sorted order. + /// + /// `sorted_keys()[i]` is the key at leaf index `i`. Used by the + /// responder's audit-answer path to recover the `leaf_index` field + /// for a challenged key in `O(log n)` via binary search. + #[must_use] + pub fn sorted_keys(&self) -> Vec { + self.leaves.iter().map(|(k, _)| *k).collect() + } + + /// The key at sorted leaf index `idx`, if in range. + /// + /// Used by the subtree-proof builder to enumerate the keys of a + /// contiguous leaf range without cloning the whole key list. + #[must_use] + pub fn key_at(&self, idx: usize) -> Option { + self.leaves.get(idx).map(|(k, _)| *k) + } + + /// The node hash at `(level, index)`, where `level` counts up from the + /// leaves (`level == 0` is the leaf level, the last level is the root). + /// + /// Returns `None` if out of range. Used by the subtree-proof builder to + /// read sibling cut-hashes along the path from the root to the selected + /// subtree; honours the same left-packed self-pair construction as the + /// rest of the tree (a caller asking for an out-of-range sibling on an + /// odd-length level should substitute the node itself). + #[must_use] + pub fn node_at(&self, level: usize, index: u64) -> Option<[u8; 32]> { + let index = usize::try_from(index).ok()?; + self.levels.get(level).and_then(|l| l.get(index)).copied() + } + + /// The number of levels in the tree (`1` for a single-leaf tree; the + /// last index is the root level). 
`depth == levels_count() - 1`. + #[must_use] + pub fn levels_count(&self) -> usize { + self.levels.len() + } +} + +/// Build the next level up from `cur`. Odd-length levels pair the last +/// node with itself (`node_hash(x, x)`) so the level above has +/// `ceil(n/2)` nodes. Keeps the tree balanced without needing a dummy +/// leaf domain. +fn build_next_level(cur: &[[u8; 32]]) -> Vec<[u8; 32]> { + let mut next = Vec::with_capacity(cur.len().div_ceil(2)); + let mut i = 0; + while i < cur.len() { + let left = &cur[i]; + let right = if i + 1 < cur.len() { &cur[i + 1] } else { left }; + next.push(node_hash(left, right)); + i += 2; + } + next +} + +/// Verify an inclusion path against a commitment of size `key_count`. +/// +/// `leaf_index` is the responder's position of this leaf in the sorted +/// leaf set; the commitment's `key_count` comes from +/// `StorageCommitment.key_count`. +/// At each level of the path, if the current index is even, the current +/// hash is the left child and we compute `node_hash(self, sibling)`; +/// otherwise it is the right child and we compute `node_hash(sibling, self)`. +/// +/// Returns `true` iff: +/// - `leaf_index < key_count` (rejects out-of-range claims), AND +/// - `path.len() == ceil(log2(key_count))` for `key_count > 1`, or +/// `path.is_empty()` for `key_count == 1` (rejects wrong-shape paths +/// before doing any hashing), AND +/// - the recomputed root equals `expected_root`. +#[must_use] +pub fn verify_path( + leaf: &[u8; 32], + path: &[[u8; 32]], + leaf_index: usize, + key_count: u32, + expected_root: &[u8; 32], +) -> bool { + if key_count == 0 + || key_count > MAX_COMMITMENT_KEY_COUNT + || (leaf_index as u64) >= u64::from(key_count) + { + return false; + } + // Tree depth = ceil(log2(key_count)). For a power-of-two `n`, + // `n.next_power_of_two() == n` so trailing_zeros == log2(n). For non + // powers-of-two, next_power_of_two rounds up so trailing_zeros gives + // ceil(log2). 
Special case: key_count == 1 → next_power_of_two == 1 + // → trailing_zeros == 0 → empty path, which matches the single-leaf + // tree's root == leaf invariant. + // + // `checked_next_power_of_two` returns None on overflow; combined with + // the MAX_COMMITMENT_KEY_COUNT cap above it cannot fail in practice, + // but the explicit check is profile-independent (release vs debug + // would otherwise differ on overflow per Rust's primitive docs). + let Some(rounded) = key_count.checked_next_power_of_two() else { + return false; + }; + let expected_path_len = rounded.trailing_zeros() as usize; + if path.len() != expected_path_len { + return false; + } + + let mut cur = *leaf; + let mut i = leaf_index; + for sibling in path { + cur = if i % 2 == 0 { + node_hash(&cur, sibling) + } else { + node_hash(sibling, &cur) + }; + i /= 2; + } + cur == *expected_root +} + +// --------------------------------------------------------------------------- +// Sign + verify +// --------------------------------------------------------------------------- + +/// Sign a commitment's `(root, key_count, sender_peer_id, sender_public_key)` +/// with `secret_key`. +/// +/// The signature is over the canonical signed payload (see +/// `commitment_signed_payload`) under [`DOMAIN_COMMITMENT`]. +/// +/// # Errors +/// +/// Returns an error if the underlying ML-DSA-65 signer fails. +pub fn sign_commitment( + secret_key: &MlDsaSecretKey, + root: &[u8; 32], + key_count: u32, + sender_peer_id: &[u8; 32], + sender_public_key: &[u8], +) -> Result, CommitmentError> { + let payload = commitment_signed_payload(root, key_count, sender_peer_id, sender_public_key); + let dsa = ml_dsa_65(); + let sig = dsa + .sign_with_context(secret_key, &payload, DOMAIN_COMMITMENT) + .map_err(|e| CommitmentError::SignatureFailed(e.to_string()))?; + Ok(sig.to_bytes()) +} + +/// Verify a commitment's signature using the embedded `sender_public_key`. 
+/// +/// Returns `true` iff the signature is valid for `(root, key_count, +/// sender_peer_id, sender_public_key)` under `c.sender_public_key` and +/// [`DOMAIN_COMMITMENT`]. Returns `false` on key-format or signature-format +/// errors so the caller can simply drop the gossip. +/// +/// Verifying against the embedded key removes the need for an external +/// `PeerId → MlDsaPublicKey` lookup. The peer-id binding gate in +/// `ingest_peer_commitment` (and the auditor's `evaluate_subtree_structure`) +/// still ensures the embedded key belongs to the claimed peer. +#[must_use] +pub fn verify_commitment_signature(c: &StorageCommitment) -> bool { + let Ok(public_key) = MlDsaPublicKey::from_bytes(MlDsaVariant::MlDsa65, &c.sender_public_key) + else { + return false; + }; + verify_commitment_signature_with_key(c, &public_key) +} + +/// Verify a commitment's signature against an externally provided key. +/// +/// Test-helper variant. Production code should use [`verify_commitment_signature`] +/// since the key is embedded in the commitment. +#[must_use] +pub fn verify_commitment_signature_with_key( + c: &StorageCommitment, + public_key: &MlDsaPublicKey, +) -> bool { + let payload = commitment_signed_payload( + &c.root, + c.key_count, + &c.sender_peer_id, + &c.sender_public_key, + ); + let Ok(sig) = MlDsaSignature::from_bytes(MlDsaVariant::MlDsa65, &c.signature) else { + return false; + }; + let dsa = ml_dsa_65(); + dsa.verify_with_context(public_key, &payload, &sig, DOMAIN_COMMITMENT) + .unwrap_or(false) +} + +// --------------------------------------------------------------------------- +// Errors +// --------------------------------------------------------------------------- + +/// Errors from commitment construction or verification. +#[derive(Debug, Clone, thiserror::Error)] +pub enum CommitmentError { + /// `MerkleTree::build` was called with an empty key set. 
+ #[error("cannot build commitment over empty key set")] + EmptyKeySet, + /// Key set exceeds [`MAX_COMMITMENT_KEY_COUNT`]. + #[error("commitment key count {0} exceeds MAX_COMMITMENT_KEY_COUNT")] + TooManyKeys(usize), + /// `MerkleTree::build` received the same key twice. + #[error("duplicate key in commitment: {}", hex::encode(.0))] + DuplicateKey(XorName), + /// Underlying ML-DSA-65 signer failed. + #[error("commitment signing failed: {0}")] + SignatureFailed(String), +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] +mod tests { + use super::*; + + fn xn(byte: u8) -> XorName { + [byte; 32] + } + + fn bh(byte: u8) -> [u8; 32] { + [byte ^ 0x5A; 32] + } + + #[test] + fn empty_key_set_rejected() { + let result = MerkleTree::build(vec![]); + assert!(matches!(result, Err(CommitmentError::EmptyKeySet))); + } + + #[test] + fn duplicate_keys_rejected() { + let result = MerkleTree::build(vec![(xn(1), bh(1)), (xn(1), bh(2))]); + assert!(matches!(result, Err(CommitmentError::DuplicateKey(_)))); + } + + #[test] + fn single_leaf_tree_root_is_leaf_hash() { + let key = xn(1); + let bytes_hash = bh(1); + let tree = MerkleTree::build(vec![(key, bytes_hash)]).unwrap(); + assert_eq!(tree.root(), leaf_hash(&key, &bytes_hash)); + assert_eq!(tree.key_count(), 1); + assert_eq!(tree.path_for(&key), Some(vec![])); + // Empty path verifies trivially (root == leaf). + assert!(verify_path( + &leaf_hash(&key, &bytes_hash), + &[], + 0, + 1, + &tree.root() + )); + } + + #[test] + fn two_leaf_tree_root_combines_both_leaves() { + let entries = vec![(xn(1), bh(1)), (xn(2), bh(2))]; + let tree = MerkleTree::build(entries).unwrap(); + // Sorted order: xn(1), xn(2). 
+ let l1 = leaf_hash(&xn(1), &bh(1)); + let l2 = leaf_hash(&xn(2), &bh(2)); + assert_eq!(tree.root(), node_hash(&l1, &l2)); + } + + #[test] + fn root_is_deterministic_regardless_of_input_order() { + let mut a = vec![(xn(3), bh(3)), (xn(1), bh(1)), (xn(2), bh(2))]; + let mut b = vec![(xn(2), bh(2)), (xn(3), bh(3)), (xn(1), bh(1))]; + let tree_a = MerkleTree::build(a.clone()).unwrap(); + let tree_b = MerkleTree::build(b.clone()).unwrap(); + a.sort_by_key(|x| x.0); + b.sort_by_key(|x| x.0); + assert_eq!(tree_a.root(), tree_b.root()); + } + + fn xn_u32(i: u32) -> XorName { + let mut k = [0u8; 32]; + k[..4].copy_from_slice(&i.to_le_bytes()); + k + } + + fn bh_u32(i: u32) -> [u8; 32] { + let mut h = [0u8; 32]; + h[..4].copy_from_slice(&i.to_le_bytes()); + h[4] = 0x5A; + h + } + + #[test] + fn paths_verify_for_every_key_at_various_sizes() { + for n in [1u32, 2, 3, 4, 5, 7, 8, 16, 17, 100, 333] { + let entries: Vec<_> = (0..n).map(|i| (xn_u32(i), bh_u32(i))).collect(); + let tree = MerkleTree::build(entries.clone()).unwrap(); + let root = tree.root(); + let key_count = tree.key_count(); + for (idx, (k, _)) in tree.iter_leaves().enumerate() { + let path = tree.path_for(k).expect("path for present key"); + let bytes_hash = entries.iter().find(|(kk, _)| kk == k).unwrap().1; + let lh = leaf_hash(k, &bytes_hash); + assert!( + verify_path(&lh, &path, idx, key_count, &root), + "path verify failed at n={n} idx={idx}", + ); + } + } + } + + #[test] + fn path_for_absent_key_is_none() { + let tree = MerkleTree::build(vec![(xn(1), bh(1)), (xn(2), bh(2))]).unwrap(); + assert!(tree.path_for(&xn(99)).is_none()); + } + + #[test] + fn tampered_bytes_hash_breaks_path_verify() { + // Use 8 distinct sorted keys so the index in `entries` matches the + // sorted leaf index in the tree. 
+ let entries: Vec<_> = (1..=8u8).map(|i| (xn(i), bh(i))).collect(); + let tree = MerkleTree::build(entries.clone()).unwrap(); + let root = tree.root(); + let (k, _) = &entries[3]; + let path = tree.path_for(k).unwrap(); + + let wrong_bytes_hash = [0xFFu8; 32]; + let lh = leaf_hash(k, &wrong_bytes_hash); + assert!(!verify_path(&lh, &path, 3, 8, &root)); + } + + #[test] + fn tampered_path_node_breaks_verify() { + let entries: Vec<_> = (1..=8u8).map(|i| (xn(i), bh(i))).collect(); + let tree = MerkleTree::build(entries.clone()).unwrap(); + let root = tree.root(); + let (k, _) = &entries[3]; + let mut path = tree.path_for(k).unwrap(); + path[0][0] ^= 0x01; + let lh = leaf_hash(k, &bh(4)); + assert!(!verify_path(&lh, &path, 3, 8, &root)); + } + + #[test] + fn wrong_leaf_index_breaks_verify() { + let entries: Vec<_> = (1..=8u8).map(|i| (xn(i), bh(i))).collect(); + let tree = MerkleTree::build(entries.clone()).unwrap(); + let root = tree.root(); + let (k, _) = &entries[3]; + let path = tree.path_for(k).unwrap(); + let lh = leaf_hash(k, &bh(4)); + // Correct index is 3; using 2 should fail because the left/right + // child ordering swaps. + assert!(!verify_path(&lh, &path, 2, 8, &root)); + assert!(verify_path(&lh, &path, 3, 8, &root)); + } + + #[test] + fn out_of_range_leaf_index_rejected() { + let entries: Vec<_> = (1..=8u8).map(|i| (xn(i), bh(i))).collect(); + let tree = MerkleTree::build(entries.clone()).unwrap(); + let root = tree.root(); + let (k, _) = &entries[3]; + let path = tree.path_for(k).unwrap(); + let lh = leaf_hash(k, &bh(4)); + // leaf_index >= key_count must be rejected without even hashing. + assert!(!verify_path(&lh, &path, 8, 8, &root)); + assert!(!verify_path(&lh, &path, 99, 8, &root)); + // Valid baseline. 
+ assert!(verify_path(&lh, &path, 3, 8, &root)); + } + + #[test] + fn wrong_path_length_rejected_pre_hashing() { + let entries: Vec<_> = (1..=8u8).map(|i| (xn(i), bh(i))).collect(); + let tree = MerkleTree::build(entries.clone()).unwrap(); + let root = tree.root(); + let (k, _) = &entries[3]; + let path = tree.path_for(k).unwrap(); + let lh = leaf_hash(k, &bh(4)); + // For key_count=8 the expected path length is 3 (ceil(log2(8))=3). + assert_eq!(path.len(), 3); + // Truncating breaks structural check. + let short: Vec<_> = path.iter().take(2).copied().collect(); + assert!(!verify_path(&lh, &short, 3, 8, &root)); + // Padding too long also breaks structural check. + let mut long = path; + long.push([0; 32]); + assert!(!verify_path(&lh, &long, 3, 8, &root)); + } + + #[test] + fn zero_key_count_rejected() { + // Defensive: even with an empty path and correct-shape root, a + // commitment claiming zero keys is nonsensical. + let lh = [0u8; 32]; + assert!(!verify_path(&lh, &[], 0, 0, &[0u8; 32])); + } + + #[test] + fn out_of_protocol_key_count_rejected() { + // Wire-supplied key_count exceeding MAX_COMMITMENT_KEY_COUNT is + // refused before any hashing. Guards an overflow found in review: + // `next_power_of_two()` would otherwise panic in debug and wrap in + // release on key_count > 1 << 31. 
+ let lh = [0u8; 32]; + assert!(!verify_path( + &lh, + &[], + 0, + MAX_COMMITMENT_KEY_COUNT + 1, + &[0u8; 32] + )); + assert!(!verify_path(&lh, &[], 0, u32::MAX, &[0u8; 32])); + } + + fn pk_bytes(pk: &MlDsaPublicKey) -> Vec { + pk.to_bytes() + } + + #[test] + fn sign_and_verify_roundtrip() { + let dsa = ml_dsa_65(); + let (pk, sk) = dsa.generate_keypair().unwrap(); + let entries: Vec<_> = (0..5u8).map(|i| (xn(i), bh(i))).collect(); + let tree = MerkleTree::build(entries).unwrap(); + let root = tree.root(); + let key_count = tree.key_count(); + let peer_id = [0xAB; 32]; + let pk_b = pk_bytes(&pk); + let signature = sign_commitment(&sk, &root, key_count, &peer_id, &pk_b).unwrap(); + let c = StorageCommitment { + root, + key_count, + sender_peer_id: peer_id, + sender_public_key: pk_b, + signature, + }; + // Verifies via embedded key, no external lookup needed. + assert!(verify_commitment_signature(&c)); + } + + #[test] + fn signature_fails_when_root_tampered() { + let dsa = ml_dsa_65(); + let (pk, sk) = dsa.generate_keypair().unwrap(); + let root = [0u8; 32]; + let pk_b = pk_bytes(&pk); + let signature = sign_commitment(&sk, &root, 1, &[0; 32], &pk_b).unwrap(); + let c = StorageCommitment { + root: [1u8; 32], // tampered + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: pk_b, + signature, + }; + assert!(!verify_commitment_signature(&c)); + } + + #[test] + fn signature_fails_under_swapped_public_key() { + let dsa = ml_dsa_65(); + let (pk1, sk1) = dsa.generate_keypair().unwrap(); + let (pk2, _sk2) = dsa.generate_keypair().unwrap(); + let pk1_b = pk_bytes(&pk1); + let pk2_b = pk_bytes(&pk2); + // Sign under pk1 but embed pk2 — verification (using embedded key) + // should fail because pk2 didn't sign this payload AND because the + // signed payload binds pk1, not pk2. 
+ let signature = sign_commitment(&sk1, &[0u8; 32], 1, &[0; 32], &pk1_b).unwrap(); + let c = StorageCommitment { + root: [0u8; 32], + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: pk2_b, + signature, + }; + assert!(!verify_commitment_signature(&c)); + } + + #[test] + fn signature_fails_with_garbage_bytes() { + let dsa = ml_dsa_65(); + let (pk, _sk) = dsa.generate_keypair().unwrap(); + let c = StorageCommitment { + root: [0u8; 32], + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: pk_bytes(&pk), + signature: vec![0u8; 100], // too short and zero-filled + }; + assert!(!verify_commitment_signature(&c)); + } + + #[test] + fn signature_fails_with_garbage_public_key() { + // Embedded pubkey is wrong length / invalid → from_bytes fails → + // verify returns false. Defends against malformed gossip. + let c = StorageCommitment { + root: [0u8; 32], + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: vec![0u8; 100], // wrong length + signature: vec![0u8; 3293], + }; + assert!(!verify_commitment_signature(&c)); + } + + #[test] + fn commitment_hash_differs_on_any_field_change() { + let dsa = ml_dsa_65(); + let (pk, sk) = dsa.generate_keypair().unwrap(); + let pk_b = pk_bytes(&pk); + let sig = sign_commitment(&sk, &[0; 32], 1, &[0; 32], &pk_b).unwrap(); + let c1 = StorageCommitment { + root: [0; 32], + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: pk_b, + signature: sig, + }; + let h1 = commitment_hash(&c1).unwrap(); + + let mut c2 = c1.clone(); + c2.root = [1; 32]; + assert_ne!(h1, commitment_hash(&c2).unwrap()); + + let mut c3 = c1.clone(); + c3.key_count = 2; + assert_ne!(h1, commitment_hash(&c3).unwrap()); + + let mut c4 = c1.clone(); + c4.sender_peer_id = [1; 32]; + assert_ne!(h1, commitment_hash(&c4).unwrap()); + + let mut c5 = c1.clone(); + c5.signature[0] ^= 1; + assert_ne!(h1, commitment_hash(&c5).unwrap()); + + let (pk_other, _) = dsa.generate_keypair().unwrap(); + let mut c6 = c1; + c6.sender_public_key = 
pk_bytes(&pk_other); + assert_ne!(h1, commitment_hash(&c6).unwrap()); + } + + #[test] + fn commitment_hash_stable_for_identical_input() { + let dsa = ml_dsa_65(); + let (pk, sk) = dsa.generate_keypair().unwrap(); + let pk_b = pk_bytes(&pk); + let sig = sign_commitment(&sk, &[7; 32], 42, &[3; 32], &pk_b).unwrap(); + let c = StorageCommitment { + root: [7; 32], + key_count: 42, + sender_peer_id: [3; 32], + sender_public_key: pk_b, + signature: sig, + }; + assert_eq!(commitment_hash(&c), commitment_hash(&c)); + } + + #[test] + fn commitment_hash_signature_length_change_changes_hash() { + // Postcard's varint length prefix means hashing a 1-byte signature + // and a 2-byte signature whose first byte is the same produces + // different commitment hashes — a hash that omitted the serialized + // length prefix would let boundary-shifted fields collide. + let c1 = StorageCommitment { + root: [0; 32], + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: vec![0u8; 1952], + signature: vec![0xAB], + }; + let c2 = StorageCommitment { + root: [0; 32], + key_count: 1, + sender_peer_id: [0; 32], + sender_public_key: vec![0u8; 1952], + signature: vec![0xAB, 0x00], + }; + assert_ne!(commitment_hash(&c1).unwrap(), commitment_hash(&c2).unwrap()); + } + + #[test] + fn too_many_keys_rejected() { + let mut entries = Vec::with_capacity(MAX_COMMITMENT_KEY_COUNT as usize + 1); + for i in 0..=MAX_COMMITMENT_KEY_COUNT { + let mut k = [0u8; 32]; + k[..4].copy_from_slice(&i.to_le_bytes()); + entries.push((k, [0; 32])); + } + let result = MerkleTree::build(entries); + assert!(matches!(result, Err(CommitmentError::TooManyKeys(_)))); + } +} diff --git a/src/replication/commitment_state.rs b/src/replication/commitment_state.rs new file mode 100644 index 00000000..15609727 --- /dev/null +++ b/src/replication/commitment_state.rs @@ -0,0 +1,1092 @@ +//! Responder-side commitment builder + rotation state. +//! +//! Phase 2b of the v12 storage-bound audit design. Builds, signs, and +//! 
caches a [`StorageCommitment`] over the responder's currently-stored +//! key set; serves audit lookups by `expected_commitment_hash`; retains +//! the previous commitment across one rotation so an audit pinned to it +//! does not false-fail at the rotation boundary (v5/v12 §4 retention). +//! +//! Rotation strategy: +//! +//! - `rotate(new_built)` atomically replaces `current` with `new_built` +//! and demotes the prior `current` to `previous`. The prior +//! `previous` is dropped. +//! - `lookup(hash)` reads the in-memory map and returns an [`Arc`] to +//! the matching `BuiltCommitment`, keeping it alive for the audit +//! response regardless of subsequent rotation (mirrors the `ArcSwap` +//! semantics specified in v6 §2: an in-flight reader holding its +//! `Arc` is unaffected by a concurrent rotate). +//! +//! No persistent disk state. Trees are rebuilt from `LmdbStorage` at +//! the next rotation tick. Memory cost is bounded by +//! `2 × (key_count × ~64 bytes + signature_size)` — for 10k keys, ~1.3 MB. + +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use parking_lot::RwLock; +use saorsa_pqc::api::sig::MlDsaSecretKey; + +use crate::ant_protocol::XorName; +use crate::replication::commitment::{ + commitment_hash, sign_commitment, CommitmentError, MerkleTree, StorageCommitment, +}; + +/// Auditor-side per-peer commitment state. +/// +/// Holds two things that together implement v10/v12 §2 step 5 and §6: +/// - `last_commitment`: the most recently received, verified, signed +/// commitment from this peer. `None` if we've evicted it (TTL, +/// sybil cap, peer-removed) or never received one. +/// - `commitment_capable`: a **sticky** boolean that flips to `true` +/// on the first successful gossip ingest and NEVER reverts. Used +/// by holder-eligibility (§6) and bootstrap-claim shield: a peer +/// that has at least once proven it speaks v12 is forever held to +/// that standard. 
Without stickiness, a peer could flip the flag +/// off by silencing its gossip and downgrade to the weaker legacy +/// audit path. +#[derive(Debug, Clone)] +pub struct PeerCommitmentRecord { + /// Last verified commitment, or `None` if evicted/expired. + pub last_commitment: Option, + /// Sticky: true once this peer has gossiped a valid commitment. + /// Set on ingest. Never set back to false except by full + /// `PeerRemoved` cleanup. + pub commitment_capable: bool, + /// When `last_commitment` was received. Used for TTL on the + /// commitment itself (independent of the `commitment_capable` + /// stickiness — losing the commitment via TTL doesn't make us + /// forget the peer ever spoke v12). + pub received_at: Instant, + /// Last time we performed an ML-DSA signature verify for this + /// peer's commitment. Used to enforce the §2 step 3 rate limit + /// (at most one sig verify per peer per 60s). + pub last_sig_verify_at: Instant, +} + +impl PeerCommitmentRecord { + /// Construct from a freshly-verified commitment. `commitment_capable` + /// is set to `true` here and must remain so for the lifetime of the + /// record. + #[must_use] + pub fn from_verified(commitment: StorageCommitment, now: Instant) -> Self { + Self { + last_commitment: Some(commitment), + commitment_capable: true, + received_at: now, + last_sig_verify_at: now, + } + } + + /// Mark commitment-capable without storing a commitment (used when + /// we've TTL-expired the commitment itself but want to remember the + /// peer has spoken v12 before). + #[must_use] + pub fn capable_but_no_commitment(now: Instant) -> Self { + Self { + last_commitment: None, + commitment_capable: true, + received_at: now, + last_sig_verify_at: now, + } + } +} + +/// A fully-built commitment: signed wire blob, cached hash, Merkle tree +/// for inclusion proofs, and a sorted leaf-index lookup for the auditor's +/// `leaf_index` field. 
+/// +/// Held inside an [`Arc`] so audit responders can grab a reference and +/// build a reply without holding the [`ResponderCommitmentState`] read +/// lock for the duration of the response. +pub struct BuiltCommitment { + /// The signed wire blob. + commitment: StorageCommitment, + /// `commitment_hash(commitment)` — cached so audit lookups don't + /// re-serialize on every match. + cached_hash: [u8; 32], + /// The Merkle tree behind the commitment. `path_for(key)` produces + /// the inclusion proof; the responder's leaf-index lookup is below. + tree: MerkleTree, + /// `sorted_keys[i]` is the key at leaf index `i`. Sorted ascending + /// so binary search reconstructs `leaf_index` for any key in + /// `O(log n)`. + sorted_keys: Vec, +} + +impl BuiltCommitment { + /// Build a commitment over `entries = [(key, bytes_hash), ...]` and + /// sign it with `secret_key`. + /// + /// `entries` does not need to be sorted (the inner [`MerkleTree`] + /// sorts internally); `sender_peer_id` is bound into the signature + /// and the commitment. + /// + /// # Errors + /// + /// Returns the wrapped [`CommitmentError`] on empty key sets, + /// over-cap key counts, duplicates, or signing failures. + pub fn build( + entries: Vec<(XorName, [u8; 32])>, + sender_peer_id: &[u8; 32], + secret_key: &MlDsaSecretKey, + sender_public_key: &[u8], + ) -> Result { + let tree = MerkleTree::build(entries)?; + let root = tree.root(); + let key_count = tree.key_count(); + let signature = sign_commitment( + secret_key, + &root, + key_count, + sender_peer_id, + sender_public_key, + )?; + let commitment = StorageCommitment { + root, + key_count, + sender_peer_id: *sender_peer_id, + sender_public_key: sender_public_key.to_vec(), + signature, + }; + // `commitment_hash` only returns None on a postcard serialization + // failure, which for our fixed-size commitment cannot occur in + // practice (ML-DSA-65 signature is 3293 bytes). 
If it ever + // somehow does, surface as a SignatureFailed so callers don't + // need a new error variant for an unreachable case. + let cached_hash = commitment_hash(&commitment).ok_or_else(|| { + CommitmentError::SignatureFailed("commitment serialization failed".to_string()) + })?; + // Recover the sorted key list from the tree (path_for uses + // binary search internally, but we need an explicit list for + // leaf_index lookup at audit time). + let sorted_keys: Vec = tree.sorted_keys(); + Ok(Self { + commitment, + cached_hash, + tree, + sorted_keys, + }) + } + + /// The signed wire blob. + #[must_use] + pub fn commitment(&self) -> &StorageCommitment { + &self.commitment + } + + /// The cached commitment hash. Equal to + /// [`crate::replication::commitment::commitment_hash`] + /// `(self.commitment())`. + #[must_use] + pub fn hash(&self) -> [u8; 32] { + self.cached_hash + } + + /// The Merkle tree behind this commitment. + /// + /// Used by the subtree-audit responder to plan a proof (select the + /// nonce-determined branch and read its sibling cut-hashes). + #[must_use] + pub fn tree(&self) -> &MerkleTree { + &self.tree + } + + /// Inclusion path + leaf index for `key`, if it is in this + /// commitment. Returns `None` if `key` is not committed. + #[must_use] + pub fn proof_for(&self, key: &XorName) -> Option<(Vec<[u8; 32]>, u32)> { + let idx = self.sorted_keys.binary_search(key).ok()?; + let path = self.tree.path_for(key)?; + // u32 cast safe because MerkleTree::build rejects > MAX_COMMITMENT_KEY_COUNT. + let leaf_index = u32::try_from(idx).unwrap_or(u32::MAX); + Some((path, leaf_index)) + } + + /// Whether `key` is committed in this tree. Allocation-free membership + /// check (binary search over the sorted leaf keys) — equivalent to + /// `proof_for(key).is_some()` but without building the inclusion path, for + /// hot callers (e.g. the pruner's `is_held` veto) that only need the + /// boolean. 
+ #[must_use] + pub fn contains_key(&self, key: &XorName) -> bool { + self.sorted_keys.binary_search(key).is_ok() + } +} + +/// Number of recently-gossiped commitments a responder stays answerable for +/// (ADR-0002 "you stay answerable for what you publish"). +/// +/// The auditor only ever pins a commitment it received via gossip, so retaining +/// the last two **actually-gossiped** commitments (plus the current one) +/// guarantees an honest node can always answer a pin the auditor could have +/// formed. Two — not one — absorbs the race where the auditor pins the +/// commitment a node published just before its newest one. Retention is keyed on +/// gossip emission, NOT on the rotation timer: a node that rebuilds its tree +/// faster than it gossips never drops a commitment it actually put on the wire, +/// so it is never wrongly failed for "unknown commitment hash". +const RETAINED_GOSSIPED_COMMITMENTS: usize = 2; + +/// How long a gossiped commitment stays answerable after it was last put on the +/// wire. Retention (and therefore the pruner's `is_held` deletion veto) is +/// anchored to gossip emission, not to the rotation timer or to distinct-hash +/// churn: a commitment record expires this long after its last `mark_gossiped`, +/// even if the node keeps re-gossiping nothing new (the steady-state no-op +/// rotation case) or stops being responsible for all its keys. +/// +/// Sized so it strictly dominates the longest realistic auditor pin lifetime — +/// well above the neighbor-sync gossip cadence and per-peer cooldown (≤1 h) — +/// while staying far below the prune hysteresis (days), so once a stale key +/// stops being gossiped the pruner reclaims it promptly. At +/// `RETAINED_GOSSIPED_COMMITMENTS = 2` this is `(2 + 1) ×` the 1 h rotation +/// interval = 3 h. +pub(crate) const GOSSIP_ANSWERABILITY_TTL: Duration = Duration::from_secs(3 * 3600); + +/// Responder retention state (ADR-0002). 
+/// +/// Keeps the current (latest-rotated) commitment plus every commitment whose +/// hash is among the last `RETAINED_GOSSIPED_COMMITMENTS` *gossiped* hashes. +/// A built-but-never-gossiped commitment is dropped on the next rotation unless +/// it gets gossiped. Mutated by rotation, gossip stamping, retirement, `age_out` and `clear_all`. +pub struct ResponderCommitmentState { + inner: RwLock, +} + +/// A commitment hash that was emitted on the wire, with the monotonic instant it +/// was last gossiped. The `last_gossiped_at` is the answerability anchor: the +/// record (and any slot it retains) expires `GOSSIP_ANSWERABILITY_TTL` after +/// this instant, independent of rotation ticks or distinct-hash churn. +#[derive(Clone, Copy)] +struct GossipedAt { + hash: [u8; 32], + last_gossiped_at: Instant, +} + +struct Inner { + /// Newest-first. When `has_current` is true, `slots[0]` is the current + /// (advertised) commitment; the rest — and, once retired, `slots[0]` too — + /// are retained only because their hash is still in `recently_gossiped` and + /// not yet expired. + slots: Vec>, + /// Whether `slots[0]` is the live, advertised current commitment. Set by + /// `rotate`; cleared by `retire_current` (and when the slot set empties). + /// When false, `current()` returns `None` — the node stops advertising and + /// re-gossiping the stale root, so it ages out by its gossip TTL — while + /// `lookup_by_hash` still answers any in-flight pin until then. This + /// decouples ADVERTISE (gossiped as current, refreshes the TTL) from ANSWER + /// (still resolvable during the TTL window). + has_current: bool, + /// The last `RETAINED_GOSSIPED_COMMITMENTS` commitments actually emitted on + /// the wire, newest-first, each stamped with when it was last gossiped. A + /// commitment is retained iff it is the live current one or its hash appears + /// here with an unexpired stamp. 
+ recently_gossiped: Vec, +} + +impl Default for ResponderCommitmentState { + fn default() -> Self { + Self::new() + } +} + +impl ResponderCommitmentState { + /// Empty state: no commitments yet. Audits before the first rotation + /// see `None` lookups and the auditor falls back to the legacy plain + /// digest path. + #[must_use] + pub fn new() -> Self { + Self { + inner: RwLock::new(Inner { + slots: Vec::with_capacity(RETAINED_GOSSIPED_COMMITMENTS + 1), + has_current: false, + recently_gossiped: Vec::with_capacity(RETAINED_GOSSIPED_COMMITMENTS), + }), + } + } + + /// Rotate: the freshly-rebuilt commitment becomes `current`. Slots that are + /// neither the new current nor among the last gossiped hashes are dropped + /// (a built-but-never-gossiped commitment does not linger). + pub fn rotate(&self, new_current: BuiltCommitment) { + let new_current = Arc::new(new_current); + let mut guard = self.inner.write(); + guard.slots.insert(0, new_current); + guard.has_current = true; + prune_slots(&mut guard, Instant::now()); + } + + /// Retire the current commitment WITHOUT clearing retention: stop + /// advertising it (so `current()` returns `None`, the gossip-emit sites stop + /// re-emitting and re-stamping it, and it can age out by its gossip TTL), + /// while keeping it answerable via `lookup_by_hash` for any in-flight pin a + /// peer already formed — until that pin's gossip stamp expires. + /// + /// Called when the node has no key it is still responsible for: it must no + /// longer claim to hold that data going forward, but must not strand a peer + /// mid-audit on a root it gossiped moments ago. A never-gossiped current is + /// simply dropped (nothing to stay answerable for). + pub fn retire_current(&self) { + let mut guard = self.inner.write(); + guard.has_current = false; + prune_slots(&mut guard, Instant::now()); + } + + /// Record that `hash` was emitted on the wire (gossiped). 
Keeps the last + /// `RETAINED_GOSSIPED_COMMITMENTS` gossiped hashes so the matching + /// commitments stay answerable (ADR-0002). Call at every gossip-emit site. + /// + /// Re-gossiping a hash already present **refreshes** its answerability + /// deadline to now and moves it to the front: every time the node actually + /// puts a root on the wire — including re-emitting the current root in the + /// steady-state no-op-rotation case — its retention legitimately extends. + /// Conversely a root that stops being gossiped expires + /// `GOSSIP_ANSWERABILITY_TTL` after its last emission, which is what lets + /// an out-of-range key age out even when the no-op guard freezes the + /// committed key set. + pub fn mark_gossiped(&self, hash: [u8; 32]) { + let now = Instant::now(); + let mut guard = self.inner.write(); + mark_gossiped_locked(&mut guard, hash, now); + } + + /// Atomically snapshot the current commitment to advertise AND mark it + /// gossiped, under a single lock. Returns the commitment to put on the wire, + /// or `None` if there is no live current (never rotated, or retired). + /// + /// This is the ONLY correct way to gossip the current commitment: doing + /// `current()` then a separate `mark_gossiped()` is a TOCTOU — a concurrent + /// `retire_current`/`rotate` between the two could drop the slot, so the node + /// would emit a root the responder no longer retains (a peer pinning it would + /// get "unknown commitment hash" → false failure). Taking the snapshot and + /// the stamp in one critical section guarantees anything emitted is + /// simultaneously retained for its answerability TTL. 
+ #[must_use] + pub fn current_for_gossip(&self) -> Option> { + let now = Instant::now(); + let mut guard = self.inner.write(); + if !guard.has_current { + return None; + } + let current = guard.slots.first().map(Arc::clone)?; + mark_gossiped_locked(&mut guard, current.cached_hash, now); + Some(current) + } + + /// Expire retention purely by the wall clock, without building, signing, or + /// rotating anything. Call once per rotation tick so a gossiped commitment's + /// answerability deadline advances even when the rotation no-op guard + /// returns early (unchanged committed set) or when the node has no + /// responsible keys to commit to. This is the time-driven half of the + /// retention contract — without it, a frozen `recently_gossiped` entry would + /// keep a stale key `is_held` forever. + pub fn age_out(&self) { + let mut guard = self.inner.write(); + prune_slots(&mut guard, Instant::now()); + } + + /// Look up a commitment by its hash. Returns `Some(arc)` if `hash` + /// matches any retained slot. The returned `Arc` keeps the + /// [`BuiltCommitment`] alive for as long as the caller holds it, + /// even if a concurrent `rotate` ages it out of the retention buffer. + #[must_use] + pub fn lookup_by_hash(&self, hash: &[u8; 32]) -> Option> { + let guard = self.inner.read(); + for c in &guard.slots { + if &c.cached_hash == hash { + return Some(Arc::clone(c)); + } + } + None + } + + /// Whether `key` is committed under any retained slot (the current + /// commitment plus the last-2-gossiped ones) — i.e. whether a peer could + /// still pin a recently gossiped root and demand this key's bytes in a + /// round-2 byte challenge. + /// + /// This is the SAME predicate the round-2 responder uses to decide a key is + /// "committed" (`handle_subtree_byte_challenge` calls `built.proof_for(key)` + /// on the pinned slot, which is committed iff `contains_key`), folded over + /// every retained slot. 
The pruner consults it before deleting an + /// out-of-range key, so "the pruner will not delete it" and "the responder + /// still owes an answer for it" are provably the same boolean and cannot + /// drift. `slots` holds at most `RETAINED_GOSSIPED_COMMITMENTS` + 1 + /// commitments, and `contains_key` is an allocation-free binary search, so + /// this is a short, allocation-free read. + #[must_use] + pub fn is_held(&self, key: &XorName) -> bool { + self.inner.read().slots.iter().any(|c| c.contains_key(key)) + } + + /// Snapshot the current commitment to ADVERTISE, if any. Used by the gossip + /// piggyback path: emit `state.current()` on the next outbound + /// `NeighborSyncRequest`/`Response`. Returns `None` once the current + /// commitment has been retired (the node has no responsible keys), so the + /// node stops re-gossiping a stale root even though `lookup_by_hash` may + /// still answer it during its remaining TTL. + #[must_use] + pub fn current(&self) -> Option> { + let guard = self.inner.read(); + if guard.has_current { + guard.slots.first().map(Arc::clone) + } else { + None + } + } + + /// Number of commitment slots currently retained (the current commitment + /// plus any still-answerable recently-gossiped ones). Used only for the + /// v12 `commitment_rotated` event's `retained_slots` field; carries no + /// behavioural meaning. + #[must_use] + pub fn retained_slot_count(&self) -> usize { + self.inner.read().slots.len() + } + + /// Drop every retained slot. Called when the local store has + /// transitioned to empty: keeping the previously-advertised + /// commitment alive would invite audit failures (we can no longer + /// answer for any of the keys we committed to), and would leave + /// remote auditors pinning a hash this node will never satisfy + /// again. After clearing, the gossip piggyback path will emit + /// `commitment: None` until a fresh rotation occurs. 
+ /// + /// This is the one sanctioned escape from the "callers MUST NOT + /// clear retention by any other mechanism" invariant — empty + /// storage means there is nothing to retain. + pub fn clear_all(&self) { + let mut guard = self.inner.write(); + guard.slots.clear(); + guard.has_current = false; + guard.recently_gossiped.clear(); + } +} + +/// Stamp `hash` as gossiped at `now` (newest-first, de-duplicated, bounded to +/// `RETAINED_GOSSIPED_COMMITMENTS`) and re-run retention. Shared by +/// `mark_gossiped` and `current_for_gossip` so the snapshot-and-stamp can be one +/// critical section. +fn mark_gossiped_locked(inner: &mut Inner, hash: [u8; 32], now: Instant) { + inner.recently_gossiped.retain(|g| g.hash != hash); + inner.recently_gossiped.insert( + 0, + GossipedAt { + hash, + last_gossiped_at: now, + }, + ); + inner + .recently_gossiped + .truncate(RETAINED_GOSSIPED_COMMITMENTS); + prune_slots(inner, now); +} + +/// Enforce retention as of `now`: first expire any gossip record older than +/// `GOSSIP_ANSWERABILITY_TTL`, then keep the live current slot (only while +/// `has_current`) and any slot whose hash is still among the unexpired +/// recently-gossiped hashes; drop the rest. Idempotent; preserves newest-first +/// order. This is the single place retention is enforced. +/// +/// The current-slot exemption is conditional on `has_current`: once the current +/// commitment is retired (no responsible keys), `slots[0]` is no longer exempt +/// and ages out by its own gossip TTL exactly like any other retained slot — +/// the fix that stops a stale, continuously-re-gossiped current from pinning its +/// keys forever. +fn prune_slots(inner: &mut Inner, now: Instant) { + // 1. TTL-expire gossip records first (the answerability anchor). A record + // whose last gossip is older than the window no longer keeps anything + // answerable, regardless of distinct-hash churn or rotation ticks. 
+ inner + .recently_gossiped + .retain(|g| now.duration_since(g.last_gossiped_at) < GOSSIP_ANSWERABILITY_TTL); + + // 2. Keep the live current slot (only while has_current) + any slot still + // covered by an unexpired record. Snapshot the live hashes first to avoid + // borrowing `inner` twice (both collections are at most + // RETAINED_GOSSIPED_COMMITMENTS + 1 long). + let live: Vec<[u8; 32]> = inner.recently_gossiped.iter().map(|g| g.hash).collect(); + let has_current = inner.has_current; + let mut idx = 0usize; + inner.slots.retain(|c| { + let keep = (has_current && idx == 0) || live.contains(&c.cached_hash); + idx += 1; + keep + }); + // If nothing remains, there is no current slot to advertise. + if inner.slots.is_empty() { + inner.has_current = false; + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] +mod tests { + use super::*; + use crate::replication::commitment::{commitment_hash, leaf_hash, verify_path}; + use saorsa_pqc::api::sig::ml_dsa_65; + + fn key(byte: u8) -> XorName { + let mut k = [0u8; 32]; + k[0] = byte; + k + } + + fn bh(byte: u8) -> [u8; 32] { + [byte ^ 0x5A; 32] + } + + fn keypair() -> (saorsa_pqc::api::sig::MlDsaPublicKey, MlDsaSecretKey) { + ml_dsa_65().generate_keypair().unwrap() + } + + #[test] + fn built_commitment_hash_matches_global_hash() { + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let entries: Vec<_> = (1..=5u8).map(|i| (key(i), bh(i))).collect(); + let built = BuiltCommitment::build(entries, &[0xAB; 32], &sk, &pk_bytes).unwrap(); + let expected = commitment_hash(built.commitment()).unwrap(); + assert_eq!(built.hash(), expected); + } + + #[test] + fn built_commitment_proof_verifies_under_its_own_root() { + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let entries: Vec<_> = (1..=8u8).map(|i| 
(key(i), bh(i))).collect(); + let built = BuiltCommitment::build(entries.clone(), &[1; 32], &sk, &pk_bytes).unwrap(); + let root = built.commitment().root; + let key_count = built.commitment().key_count; + + for (k, _) in &entries { + let (path, leaf_index) = built.proof_for(k).expect("present"); + // Find the bytes_hash for this key. + let bh_k = entries.iter().find(|(kk, _)| kk == k).unwrap().1; + let lh = leaf_hash(k, &bh_k); + assert!( + verify_path(&lh, &path, leaf_index as usize, key_count, &root), + "path verify failed for key {k:?}" + ); + } + } + + #[test] + fn proof_for_absent_key_is_none() { + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let built = BuiltCommitment::build( + vec![(key(1), bh(1)), (key(2), bh(2))], + &[0; 32], + &sk, + &pk_bytes, + ) + .unwrap(); + assert!(built.proof_for(&key(99)).is_none()); + } + + #[test] + fn empty_state_returns_none() { + let state = ResponderCommitmentState::new(); + assert!(state.current().is_none()); + assert!(state.lookup_by_hash(&[0; 32]).is_none()); + } + + #[test] + fn clear_all_drops_every_slot() { + // Empty-storage transition: after clear_all, the gossip path + // must observe `current() == None` so it stops piggybacking a + // commitment the node can no longer answer audits against. 
+ let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let state = ResponderCommitmentState::new(); + let peer_id = *blake3::hash(&pk.to_bytes()).as_bytes(); + + let c1 = BuiltCommitment::build(vec![(key(1), bh(1))], &peer_id, &sk, &pk_bytes).unwrap(); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); // gossiped → retained across the next rotation + let c2 = BuiltCommitment::build(vec![(key(2), bh(2))], &peer_id, &sk, &pk_bytes).unwrap(); + let h2 = c2.hash(); + state.rotate(c2); + state.mark_gossiped(h2); + + assert!(state.current().is_some()); + assert!(state.lookup_by_hash(&h1).is_some()); + + state.clear_all(); + + assert!(state.current().is_none()); + assert!(state.lookup_by_hash(&h1).is_none()); + } + + #[test] + fn lookup_arc_outlives_subsequent_rotation() { + // INV-R2: an in-flight audit responder that grabbed an Arc must + // be able to finish building the response even after the state + // rotates that commitment out past the retention window. + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let state = ResponderCommitmentState::new(); + + let c1 = BuiltCommitment::build(vec![(key(1), bh(1))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h1 = c1.hash(); + state.rotate(c1); + + let in_flight = state.lookup_by_hash(&h1).unwrap(); + + // c1 was never gossiped, so the next rotation (a new current) drops it + // from the retention buffer. + let c2 = BuiltCommitment::build(vec![(key(2), bh(2))], &[0; 32], &sk, &pk_bytes).unwrap(); + state.rotate(c2); + assert!(state.lookup_by_hash(&h1).is_none()); + + // But the in-flight Arc still works (INV: Arc keeps it alive). + assert_eq!(in_flight.hash(), h1); + assert!(in_flight.proof_for(&key(1)).is_some()); + } + + #[test] + fn gossiped_commitment_stays_answerable_across_rotations() { + // ADR-0002: a commitment that was actually gossiped stays answerable + // even after rotation, until it falls out of the last-2-gossiped window. 
+ let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let state = ResponderCommitmentState::new(); + + let c1 = BuiltCommitment::build(vec![(key(1), bh(1))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); // we put c1 on the wire + + // Rotate to c2 and gossip it. c1 is still within the last-2-gossiped. + let c2 = BuiltCommitment::build(vec![(key(2), bh(2))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h2 = c2.hash(); + state.rotate(c2); + state.mark_gossiped(h2); + assert!( + state.lookup_by_hash(&h1).is_some(), + "c1 must stay answerable" + ); + assert!(state.lookup_by_hash(&h2).is_some()); + + // Rotate to c3 and gossip it. Now the last-2-gossiped are {h3, h2}; + // h1 has fallen out of the window and is dropped. + let c3 = BuiltCommitment::build(vec![(key(3), bh(3))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h3 = c3.hash(); + state.rotate(c3); + state.mark_gossiped(h3); + assert!( + state.lookup_by_hash(&h1).is_none(), + "c1 aged out of gossip window" + ); + assert!(state.lookup_by_hash(&h2).is_some()); + assert!(state.lookup_by_hash(&h3).is_some()); + } + + #[test] + fn current_plus_last_two_gossiped_are_simultaneously_answerable() { + // ADR-0002 "Two, not one": the retention depth must keep BOTH of the + // last two gossiped commitments answerable at the same time, alongside + // the current one. This is the property that "absorbs the race where an + // auditor asks about the commitment a node published just before its + // newest one". The existing across-rotations test only ever checks two + // hashes at once; this one proves three DISTINCT commitments are live + // simultaneously and that the third-oldest gossiped root is dropped — + // i.e. RETAINED_GOSSIPED_COMMITMENTS is exactly 2, not 1 and not 3. + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let state = ResponderCommitmentState::new(); + + // Gossip three commitments in order: c1, c2, c3. 
After this the current + // slot is c3 and the last-two-gossiped are {h3, h2}. Retention is also + // checked at the intermediate step: after c2 the window is {h2, h1}; once + // c3 is gossiped it is {h3, h2} and c1 (3rd-oldest gossiped) must be gone. + let c1 = BuiltCommitment::build(vec![(key(1), bh(1))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); + + let c2 = BuiltCommitment::build(vec![(key(2), bh(2))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h2 = c2.hash(); + state.rotate(c2); + state.mark_gossiped(h2); + + // At this moment: current = c2, last-2-gossiped = {h2, h1}. Both the + // current AND the previously-gossiped c1 must be answerable — the "two, + // not one" race window. c1 is the commitment "published just before the + // newest one" and an auditor may still pin it. + assert!( + state.lookup_by_hash(&h1).is_some(), + "the commitment published just before the newest one must stay answerable" + ); + assert!( + state.lookup_by_hash(&h2).is_some(), + "current must be answerable" + ); + assert_ne!(h1, h2, "the two retained commitments must be distinct"); + + // Now gossip a third distinct commitment c3. Window becomes {h3, h2}. + // c3 (current) + c2 + c1: c1 must now be dropped (3rd-oldest gossiped), + // while c2 and c3 remain. This proves the gossip window holds exactly 2 + // hashes in TOTAL, counting the current commitment's hash once gossiped. 
+ let c3 = BuiltCommitment::build(vec![(key(3), bh(3))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h3 = c3.hash(); + state.rotate(c3); + state.mark_gossiped(h3); + + assert_ne!(h2, h3); + assert_ne!(h1, h3); + assert!( + state.lookup_by_hash(&h3).is_some(), + "current (c3) answerable" + ); + assert!( + state.lookup_by_hash(&h2).is_some(), + "c2 (published just before newest) answerable — the race-absorbing slot" + ); + assert!( + state.lookup_by_hash(&h1).is_none(), + "c1 is the 3rd-oldest gossiped root and MUST be dropped — depth is exactly 2" + ); + } + + #[test] + fn is_held_tracks_keys_across_the_retention_window_and_ages_them_out() { + // The pruner's deletion veto relies on `is_held`: a key committed under + // ANY retained slot (current + last-2-gossiped) must read held, and must + // stop reading held once its commitment ages out of the window — that is + // the bounded reprieve, not a permanent pin. This mirrors the + // round-2 responder's `built.proof_for(key).is_some()` check folded over + // the slots, so "pruner won't delete" == "responder owes an answer". + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let state = ResponderCommitmentState::new(); + + // c1 commits to key(1). Gossip it -> key(1) is held (current slot). + let c1 = BuiltCommitment::build(vec![(key(1), bh(1))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); + assert!( + state.is_held(&key(1)), + "freshly committed+gossiped key is held" + ); + assert!(!state.is_held(&key(99)), "never-committed key is not held"); + + // c2 commits to key(2) only (key(1) dropped from the new commitment, + // e.g. it went out of range). key(1) must STILL be held via the retained + // previous gossiped slot (the race-absorbing window), and key(2) too. 
+ let c2 = BuiltCommitment::build(vec![(key(2), bh(2))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h2 = c2.hash(); + state.rotate(c2); + state.mark_gossiped(h2); + assert!( + state.is_held(&key(1)), + "key dropped from the newest commitment is still held via the previous gossiped slot" + ); + assert!(state.is_held(&key(2)), "newly committed key is held"); + + // c3 commits to key(3). Window becomes {h3, h2}; h1 ages out, so key(1) + // is no longer held anywhere -> the pruner may now reclaim it. + let c3 = BuiltCommitment::build(vec![(key(3), bh(3))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h3 = c3.hash(); + state.rotate(c3); + state.mark_gossiped(h3); + assert!( + !state.is_held(&key(1)), + "key whose commitments all aged out of the retention window is no longer held" + ); + assert!( + state.is_held(&key(2)), + "key(2) still held via the previous gossiped slot" + ); + assert!(state.is_held(&key(3)), "current key held"); + } + + /// Build a `BuiltCommitment` over the given keys for use in raw `prune_slots` + /// tests (each key's `bytes_hash` is `bh(k[0])`). + fn built(keys: &[u8]) -> BuiltCommitment { + let (pk, sk) = keypair(); + let entries: Vec<_> = keys.iter().map(|&b| (key(b), bh(b))).collect(); + BuiltCommitment::build(entries, &[0; 32], &sk, &pk.to_bytes()).unwrap() + } + + #[test] + fn stale_gossip_record_expires_by_ttl_even_without_new_distinct_gossip() { + // Frozen-retention-window regression: the no-op-rotation guard can freeze + // `recently_gossiped` (no new distinct hash is ever gossiped once the + // responsible key set stabilizes). The retention window must still age a + // stale gossiped commitment out by the WALL CLOCK, so its key stops + // being `is_held` and the pruner can reclaim it. Driven directly through + // `prune_slots(now)` with a synthetic clock so it is deterministic. 
+ let c_current = Arc::new(built(&[1])); // root over key(1) — current + let c_stale = Arc::new(built(&[2])); // root over key(2) — out-of-range, only retained via gossip + let h_current = c_current.hash(); + let h_stale = c_stale.hash(); + + // Synthetic clock: stamps anchor at `base` and the prune evaluates at a + // FUTURE `now` (adding to an `Instant` never underflows, unlike + // subtracting a TTL from a fresh Windows monotonic clock). The stale + // record was last gossiped just over the TTL before `now`; the current + // record was gossiped at `now`. This is exactly the frozen-window state: + // current keeps being re-gossiped (refreshing its stamp) while the stale + // root is never gossiped again. + let base = Instant::now(); + let now = base + GOSSIP_ANSWERABILITY_TTL + Duration::from_secs(1); + let mut inner = Inner { + slots: vec![Arc::clone(&c_current), Arc::clone(&c_stale)], + has_current: true, + recently_gossiped: vec![ + GossipedAt { + hash: h_current, + last_gossiped_at: now, + }, + GossipedAt { + hash: h_stale, + last_gossiped_at: base, + }, + ], + }; + + prune_slots(&mut inner, now); + + // The stale record (and its slot) must be gone; the current one stays. + assert!( + inner.recently_gossiped.iter().all(|g| g.hash != h_stale), + "stale gossip record past its TTL must expire" + ); + assert_eq!(inner.slots.len(), 1, "the stale slot must be dropped"); + assert_eq!(inner.slots[0].hash(), h_current, "current slot retained"); + // key(2) — committed only under the now-expired stale slot — is no + // longer held, so the pruner may reclaim it. key(1) stays held. 
+ assert!( + inner.slots.iter().all(|c| c.proof_for(&key(2)).is_none()), + "stale key is no longer held once its commitment ages out" + ); + assert!( + inner.slots.iter().any(|c| c.proof_for(&key(1)).is_some()), + "current key still held" + ); + } + + #[test] + fn recent_gossip_record_stays_answerable_within_ttl() { + // Early-drop regression: a commitment gossiped recently (within the TTL) + // must remain answerable even if it is no longer the current root — a + // peer may still have pinned it. `prune_slots` must NOT drop it early. + let c_current = Arc::new(built(&[1])); + let c_prev = Arc::new(built(&[2])); + let h_current = c_current.hash(); + let h_prev = c_prev.hash(); + + // Synthetic clock (forward-only, see the stale-expiry test above). + let base = Instant::now(); + let now = base + GOSSIP_ANSWERABILITY_TTL / 2; + let mut inner = Inner { + slots: vec![Arc::clone(&c_current), Arc::clone(&c_prev)], + has_current: true, + recently_gossiped: vec![ + GossipedAt { + hash: h_current, + last_gossiped_at: now, + }, + GossipedAt { + // Gossiped a while ago, but still comfortably within the TTL. + hash: h_prev, + last_gossiped_at: base, + }, + ], + }; + + prune_slots(&mut inner, now); + + assert_eq!( + inner.slots.len(), + 2, + "a commitment gossiped within the TTL must stay answerable (the 'two, not one' race window)" + ); + assert!( + inner.slots.iter().any(|c| c.hash() == h_prev), + "the recently-gossiped previous commitment must not be dropped early" + ); + } + + #[test] + fn retire_current_hides_current_but_keeps_recent_pin_answerable() { + // Retire-current regression: retiring the current commitment (no responsible + // keys) must STOP advertising it (current() -> None, so the gossip loop + // stops re-stamping it) while keeping it answerable for an in-flight pin. 
+ let state = ResponderCommitmentState::new(); + let c1 = built(&[1]); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); + + assert!(state.current().is_some(), "fresh current is advertised"); + + state.retire_current(); + + assert!( + state.current().is_none(), + "retired current must not be advertised (stops the gossip loop re-stamping it)" + ); + assert!( + state.lookup_by_hash(&h1).is_some(), + "retired current stays answerable for an in-flight pin within its TTL" + ); + assert!( + state.is_held(&key(1)), + "its keys are still held while answerable, so the pruner still vetoes them" + ); + } + + #[test] + fn retired_current_ages_out_by_gossip_ttl() { + // The retired current must age out by its gossip TTL — the exact fix for + // the stale-current permanent pin: its record is never refreshed (not + // advertised), so once the TTL lapses prune_slots drops it. + let c1 = Arc::new(built(&[1])); + let h1 = c1.hash(); + // Synthetic clock (forward-only, see the stale-expiry test above). + let base = Instant::now(); + let now = base + GOSSIP_ANSWERABILITY_TTL + Duration::from_secs(1); + let mut inner = Inner { + slots: vec![Arc::clone(&c1)], + has_current: false, // already retired + recently_gossiped: vec![GossipedAt { + hash: h1, + last_gossiped_at: base, + }], + }; + + prune_slots(&mut inner, now); + + assert!( + inner.slots.is_empty(), + "retired current past its TTL is dropped" + ); + assert!(!inner.has_current); + assert!( + inner.slots.iter().all(|c| c.proof_for(&key(1)).is_none()), + "its key is no longer held -> pruner reclaims it" + ); + } + + #[test] + fn retired_current_stays_answerable_within_ttl() { + // A retired current within its TTL must remain answerable (not dropped). + let c1 = Arc::new(built(&[1])); + let h1 = c1.hash(); + // Synthetic clock (forward-only, see the stale-expiry test above). 
+ let base = Instant::now(); + let now = base + GOSSIP_ANSWERABILITY_TTL / 2; + let mut inner = Inner { + slots: vec![Arc::clone(&c1)], + has_current: false, // retired + recently_gossiped: vec![GossipedAt { + hash: h1, + last_gossiped_at: base, + }], + }; + + prune_slots(&mut inner, now); + + assert_eq!( + inner.slots.len(), + 1, + "retired-but-recent current stays answerable" + ); + assert_eq!(inner.slots[0].hash(), h1); + } + + #[test] + fn re_acquire_after_retire_advertises_fresh_current_without_resurrecting_stale() { + // Re-acquire path: a node retires its current (went out of range), then + // becomes responsible again and rotates a fresh commitment. The fresh + // one must become the advertised current; the retired one must only + // linger as a retained (answerable) slot if still gossiped+unexpired, + // never resurrect as current. + let state = ResponderCommitmentState::new(); + let c1 = built(&[1]); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); // gossiped, so it stays answerable after retire + state.retire_current(); + assert!(state.current().is_none()); + + // Become responsible again: rotate a fresh commitment. + let c2 = built(&[2]); + let h2 = c2.hash(); + state.rotate(c2); + state.mark_gossiped(h2); + + let cur = state + .current() + .expect("fresh current advertised after re-acquire"); + assert_eq!( + cur.hash(), + h2, + "the FRESH commitment is current, not the retired one" + ); + assert!( + state.lookup_by_hash(&h1).is_some(), + "the retired-but-recently-gossiped commitment is still answerable as a retained slot" + ); + assert!( + state.is_held(&key(1)), + "retired key still held within its TTL" + ); + assert!(state.is_held(&key(2)), "fresh current key held"); + } + + #[test] + fn retire_current_drops_ungossiped_current() { + // A current that was never gossiped has nothing to stay answerable for, + // so retiring it drops it outright (no lookup, no current). 
+ let state = ResponderCommitmentState::new(); + let c1 = built(&[1]); + let h1 = c1.hash(); + state.rotate(c1); // built but NOT gossiped + + state.retire_current(); + + assert!(state.current().is_none(), "no current after retire"); + assert!( + state.lookup_by_hash(&h1).is_none(), + "an ungossiped retired current is not answerable (nothing to retain)" + ); + assert!(!state.is_held(&key(1))); + } + + #[test] + fn ungossiped_rebuild_does_not_evict_gossiped_commitment() { + // The rebuild-faster-than-gossip case: a node rebuilds (rotates) several + // times without gossiping. The last *gossiped* commitment must remain + // answerable so the node is not wrongly failed for "unknown hash". + let (pk, sk) = keypair(); + let pk_bytes = pk.to_bytes(); + let state = ResponderCommitmentState::new(); + + let c1 = BuiltCommitment::build(vec![(key(1), bh(1))], &[0; 32], &sk, &pk_bytes).unwrap(); + let h1 = c1.hash(); + state.rotate(c1); + state.mark_gossiped(h1); + + // Several ungossiped rebuilds. + for i in 2..=6u8 { + let c = + BuiltCommitment::build(vec![(key(i), bh(i))], &[0; 32], &sk, &pk_bytes).unwrap(); + state.rotate(c); + } + // h1 was gossiped and is still within the last-2-gossiped window + // (nothing else was gossiped), so it must still be answerable. + assert!( + state.lookup_by_hash(&h1).is_some(), + "gossiped commitment must survive ungossiped rebuilds" + ); + } +} diff --git a/src/replication/config.rs b/src/replication/config.rs index 1ca8b3db..f47b5cbf 100644 --- a/src/replication/config.rs +++ b/src/replication/config.rs @@ -14,7 +14,7 @@ use std::time::Duration; use rand::Rng; -use crate::ant_protocol::CLOSE_GROUP_SIZE; +use crate::ant_protocol::{CLOSE_GROUP_SIZE, MAX_CHUNK_SIZE}; // --------------------------------------------------------------------------- // Static constants (compile-time reference profile) @@ -104,10 +104,57 @@ pub const AUDIT_TICK_INTERVAL_MIN: Duration = Duration::from_secs(AUDIT_TICK_INT /// Audit scheduler cadence range (max). 
pub const AUDIT_TICK_INTERVAL_MAX: Duration = Duration::from_secs(AUDIT_TICK_INTERVAL_MAX_SECS); -/// Base audit response deadline (independent of challenge size). -const AUDIT_RESPONSE_BASE_SECS: u64 = 10; -/// Per-key allowance added to the base audit response deadline. -const AUDIT_RESPONSE_PER_KEY_MS: u64 = 20; +/// Floor on the audit response deadline (independent of challenge size). +/// +/// Sized to absorb worst-case global RTT for the audit envelope +/// (the request + response messages are KB-scale, not chunk-scale) +/// plus scheduling jitter. Tokyo↔NY latency is ~150 ms each way, +/// so 2 seconds comfortably covers cross-continent communication +/// for any audit. +const AUDIT_RESPONSE_FLOOR_SECS: u64 = 2; + +/// Conservative honest-responder read throughput, in bytes per second. +/// +/// Used to size the audit response deadline. An honest peer answers +/// a k-key challenge by reading k chunks from local disk, computing +/// BLAKE3 + path proofs, and signing the response. The bottleneck is +/// disk read; BLAKE3 at ~3 GB/s + ML-DSA signing at ~3 ms are +/// negligible. +/// +/// Set conservatively below any modern SSD (typical: 500 MB/s+). +/// At 50 MB/s, a k=10 sample at 4 MiB chunks reads in ~0.8s, well +/// inside even an aggressive timeout. A relay attacker who must +/// fetch the same 40 MB over the network at typical bandwidth +/// (100 Mbps = 12.5 MB/s) takes 3+ seconds for the data alone, plus +/// per-chunk network round-trips. The gap widens linearly with +/// the sample size, to the relay's disadvantage. +const AUDIT_HONEST_READ_BPS: u64 = 50 * 1024 * 1024; + +/// Slack multiplier on the honest-read estimate. +/// +/// Set so an honest peer that's slower than `AUDIT_HONEST_READ_BPS` (e.g. an +/// HDD-backed node, or one under load) still answers within the +/// timeout. 5× is generous; a relay peer fetching the same data over a +/// residential link (~5-12 MB/s) sees ~10-100× higher latency than disk +/// and misses the budget.
This is an economic deterrent calibrated for +/// residential bandwidth, NOT a hard cryptographic bound — a relay on a +/// datacenter cross-connect could still fetch fast enough to answer in +/// time (see the §7 note on `audit_response_timeout`). +const AUDIT_RESPONSE_HONEST_MULTIPLIER: u64 = 5; + +/// Single-key prune audit response deadline. +/// +/// Prune audits ask a peer whether they still hold one specific key +/// they previously claimed. The relay-defence rationale that motivates +/// the tight commitment-bound timeout does NOT apply here: the +/// auditor's own out-of-range hysteresis (`PRUNE_HYSTERESIS_DURATION`, +/// 3 days) already makes "fetch on demand" infeasible as a sustained +/// strategy. +/// +/// Sized to comfortably accommodate cold cross-continent QUIC +/// handshake plus scheduling jitter on a busy honest peer answering +/// a single-key challenge: 10 s. +const PRUNE_AUDIT_RESPONSE_SECS: u64 = 10; /// Maximum duration a peer may claim bootstrap status before penalties apply. const BOOTSTRAP_CLAIM_GRACE_PERIOD_SECS: u64 = 24 * 60 * 60; // 24 h @@ -121,13 +168,54 @@ const PRUNE_HYSTERESIS_DURATION_SECS: u64 = 3 * 24 * 60 * 60; // 3 days pub const PRUNE_HYSTERESIS_DURATION: Duration = Duration::from_secs(PRUNE_HYSTERESIS_DURATION_SECS); /// Protocol identifier for replication operations. -pub const REPLICATION_PROTOCOL_ID: &str = "autonomi.ant.replication.v1"; +/// +/// Bumped to `v2` for the v12 storage-bound audit. That change extends the +/// wire types (`NeighborSyncRequest`/`Response` carry an optional trailing +/// `StorageCommitment`, and the gossip-triggered storage-commitment audit adds +/// the `SubtreeAuditChallenge`/`SubtreeAuditResponse` and `SubtreeByteChallenge`/ +/// `SubtreeByteResponse` messages). 
The bump is for SEMANTIC interop, not +/// decode failure: postcard tolerates the appended optional field (an old +/// decoder reads the fields it knows and ignores the trailer — pinned by the +/// `old_decoder_tolerates_new_neighbor_sync_*` tests in `protocol.rs`), but +/// tolerating bytes is not interoperating. A v1 node cannot decode the NEW +/// message variants at all (unknown enum discriminant) and never acts on a +/// piggybacked commitment, so mixed-version replication would half-function — +/// audit challenges unanswered, commitments silently dropped — and a v2 node +/// could read that silence as misbehaviour. Rather than reason about each +/// such case, we route v12 replication on a distinct protocol id: a node only +/// delivers messages whose topic matches its own id (see the topic check in +/// `mod.rs`), so v1 and v2 nodes simply do not exchange replication traffic +/// during a mixed-version window. This is the rollout-safe behaviour: no +/// half-interpreted exchange, no spurious eviction. Replication between +/// matched-version peers is unaffected. (DHT routing/lookups are a separate +/// protocol and continue to span both versions.) +pub const REPLICATION_PROTOCOL_ID: &str = "autonomi.ant.replication.v2"; /// 10 MiB — maximum replication wire message size (accommodates hint batches). const REPLICATION_MESSAGE_SIZE_MIB: usize = 10; /// Maximum replication wire message size. pub const MAX_REPLICATION_MESSAGE_SIZE: usize = REPLICATION_MESSAGE_SIZE_MIB * 1024 * 1024; +/// Headroom reserved for the envelope (enum tags, ids, length prefixes) when +/// sizing a round-2 byte-challenge batch against the wire cap. +const BYTE_CHALLENGE_RESPONSE_HEADROOM: usize = 64 * 1024; + +/// Maximum keys per round-2 [`SubtreeByteChallenge`] (per-batch cap). +/// +/// Sized so the WORST-CASE response (every requested chunk at +/// `MAX_CHUNK_SIZE`) still encodes under [`MAX_REPLICATION_MESSAGE_SIZE`]. 
+/// The auditor splits its spot-check sample into batches of this size (one +/// challenge per batch, same nonce/pin); the responder rejects any single +/// challenge requesting more. +/// +/// [`SubtreeByteChallenge`]: crate::replication::protocol::SubtreeByteChallenge +pub const MAX_BYTE_CHALLENGE_KEYS: usize = + (MAX_REPLICATION_MESSAGE_SIZE - BYTE_CHALLENGE_RESPONSE_HEADROOM) / MAX_CHUNK_SIZE; +const _: () = assert!( + MAX_BYTE_CHALLENGE_KEYS >= 1, + "wire cap must fit at least one max-size chunk per byte-challenge response" +); + /// Verification request timeout (per-batch). const VERIFICATION_REQUEST_TIMEOUT_SECS: u64 = 15; /// Verification request timeout (per-batch). @@ -147,6 +235,52 @@ pub const PENDING_VERIFY_MAX_AGE: Duration = Duration::from_secs(PENDING_VERIFY_ /// Trust event weight for confirmed audit failures. pub const AUDIT_FAILURE_TRUST_WEIGHT: f64 = 5.0; +/// Consecutive audit *timeouts* a peer may accumulate before a timeout is +/// reported as an `ApplicationFailure` trust event. +/// +/// The audit response timeout is an economic deterrent calibrated for +/// residential bandwidth, not a hard cryptographic bound: a single slow +/// response is routine for an honest node under transient load (GC pause, +/// disk flush, a burst of concurrent requests). Penalizing on the first +/// timeout false-positives those nodes. +/// +/// Requiring `N` *consecutive* timeouts before penalizing removes that +/// false-positive while preserving the deterrent against a peer that does not +/// actually store the data and must fetch it at audit time: such a peer is +/// slow on *every* audit and accumulates a fresh strike each tick until it +/// crosses the threshold, whereas an honest node answers normally between rare +/// slow ticks and any success resets its strike counter to zero (see +/// `handle_audit_result`). The discriminator is *persistence* of slowness +/// versus *transience*. This deliberately does not widen the per-challenge +/// window. 
Applies ONLY to `AuditFailureReason::Timeout`; confirmed +/// storage-integrity failures (`DigestMismatch` / `KeyAbsent` / `Rejected` / +/// `MalformedResponse`) remain instantly punishable. +pub const AUDIT_TIMEOUT_STRIKE_THRESHOLD: u32 = 3; + +/// Probability of launching a subtree audit when a peer's *changed* commitment +/// is ingested via gossip (ADR-0002). Keeps audits occasional surprise exams. +pub const AUDIT_ON_GOSSIP_PROBABILITY: f64 = 0.2; + +/// Per-peer cooldown between gossip-triggered subtree audits (ADR-0002), in +/// seconds. Bounds how often any one peer is audited regardless of gossip rate. +pub const AUDIT_ON_GOSSIP_COOLDOWN_SECS: u64 = 30 * 60; + +/// Number of subtree leaves spot-checked against real chunk bytes per audit +/// (ADR-0002 real-bytes layer). +/// +/// The auditor clamps this to its 3..=5 band (`BYTE_SPOTCHECK_MIN..=MAX` in +/// `storage_commitment_audit`), so this is the effective MAXIMUM — set it +/// within the band rather than advertising a sample size the auditor never +/// requests. +pub const AUDIT_SPOTCHECK_COUNT: u32 = 5; + +/// Conservative leaf-count hint for sizing the subtree-audit response deadline. +/// +/// The deadline is set before the proof arrives, so we size for the largest +/// legal store: `sqrt(MAX_COMMITMENT_KEY_COUNT) = 1000`. Honest small stores +/// finish well within it. +pub const SUBTREE_AUDIT_TIMEOUT_LEAF_HINT: usize = 1000; + /// Maximum number of prune-confirmation audit challenges sent per prune pass. pub const MAX_PRUNE_AUDIT_CHALLENGES_PER_PASS: usize = 64; @@ -187,10 +321,24 @@ pub struct ReplicationConfig { pub audit_tick_interval_min: Duration, /// Audit scheduler cadence range (max). pub audit_tick_interval_max: Duration, - /// Base audit response deadline (key-independent component). - pub audit_response_base: Duration, - /// Per-key allowance added to the base audit response deadline. - pub audit_response_per_key: Duration, + /// Floor on the audit response deadline. 
Covers global RTT for + /// the small request/response envelope plus scheduling jitter. + /// See `AUDIT_RESPONSE_FLOOR_SECS` for sizing. + pub audit_response_floor: Duration, + /// Conservative honest-responder read throughput (bytes/sec). + /// Used to scale the audit response deadline against the size of + /// the challenge. Slow enough that even an HDD-backed honest peer + /// fits inside the budget; fast enough that a relay attacker who + /// must fetch bytes over the network falls outside. + pub audit_honest_read_bps: u64, + /// Slack multiplier on the honest-read estimate before + /// declaring an audit timed out. + pub audit_response_honest_multiplier: u64, + /// Single-key prune-audit response deadline. Has its own constant + /// because the relay-defence rationale that motivates the tight + /// commitment-bound budget does not apply to a single-key prune + /// challenge. + pub prune_audit_response_timeout: Duration, /// Maximum duration a peer may claim bootstrap status. pub bootstrap_claim_grace_period: Duration, /// Minimum continuous out-of-range duration before pruning a key. 
@@ -219,8 +367,10 @@ impl Default for ReplicationConfig { self_lookup_interval_max: SELF_LOOKUP_INTERVAL_MAX, audit_tick_interval_min: AUDIT_TICK_INTERVAL_MIN, audit_tick_interval_max: AUDIT_TICK_INTERVAL_MAX, - audit_response_base: Duration::from_secs(AUDIT_RESPONSE_BASE_SECS), - audit_response_per_key: Duration::from_millis(AUDIT_RESPONSE_PER_KEY_MS), + audit_response_floor: Duration::from_secs(AUDIT_RESPONSE_FLOOR_SECS), + audit_honest_read_bps: AUDIT_HONEST_READ_BPS, + audit_response_honest_multiplier: AUDIT_RESPONSE_HONEST_MULTIPLIER, + prune_audit_response_timeout: Duration::from_secs(PRUNE_AUDIT_RESPONSE_SECS), bootstrap_claim_grace_period: BOOTSTRAP_CLAIM_GRACE_PERIOD, prune_hysteresis_duration: PRUNE_HYSTERESIS_DURATION, verification_request_timeout: VERIFICATION_REQUEST_TIMEOUT, @@ -343,11 +493,72 @@ impl ReplicationConfig { } /// Compute the audit response timeout for a challenge with - /// `challenged_key_count` keys: `base + per_key * challenged_key_count`. + /// `challenged_key_count` keys, **sized to be tight enough that a + /// relay attacker that must fetch the chunk bytes from elsewhere + /// falls outside the budget**. + /// + /// Formula: + /// `floor + (challenged_bytes / honest_read_bps) × multiplier` + /// + /// Where `challenged_bytes = k × MAX_CHUNK_SIZE`. An honest peer + /// reads `k × 4 MiB` from local disk at `honest_read_bps` (set + /// conservatively at 50 MB/s — well below modern SSDs); the + /// multiplier of 5 absorbs jitter, BLAKE3, ML-DSA, and slow disks. + /// + /// A relay attacker on a residential link (~5-12 MB/s) who must + /// fetch the same `k × 4 MiB` over the network sees ~10-100× higher + /// latency than disk for the data alone, plus per-chunk round-trips, + /// and misses the budget — recording a timeout strike (per + /// `handle_audit_timeout` → `handle_audit_failure`). 
After + /// [`AUDIT_TIMEOUT_STRIKE_THRESHOLD`] consecutive timeouts this would + /// fire an `application_failure` trust event — but note that report is + /// currently suppressed for the breaking rollout (grep + /// TIMEOUT-EVICTION-DISABLED); the strike accounting still runs. + /// + /// This is an economic deterrent for the §7 relay limit calibrated + /// for residential bandwidth, NOT a hard bound: a relay on a + /// datacenter cross-connect (≥1 Gbps) can fetch `k × 4 MiB` fast + /// enough to answer in time. It raises the relay's cost (bandwidth + /// per audit) without claiming to make relaying impossible. The + /// cryptographic guarantee remains commitment-binding (the relay + /// must still hold or fetch the exact committed bytes); the timeout + /// only attacks the economics. #[must_use] pub fn audit_response_timeout(&self, challenged_key_count: usize) -> Duration { - let keys = u32::try_from(challenged_key_count).unwrap_or(u32::MAX); - self.audit_response_base + self.audit_response_per_key * keys + let bytes_per_key = u64::try_from(crate::ant_protocol::MAX_CHUNK_SIZE).unwrap_or(u64::MAX); + let keys = u64::try_from(challenged_key_count).unwrap_or(u64::MAX); + let total_bytes = bytes_per_key.saturating_mul(keys); + let bps = self.audit_honest_read_bps.max(1); + // Apply the multiplier BEFORE integer-dividing by bps so each + // chunk contributes a fractional second rather than rounding + // down to zero. Otherwise k in 1..=12 would all collapse to the + // floor (~40 MiB / 50 MB/s = 0 secs in integer arithmetic), and + // an honest HDD-backed peer at sqrt(N)=10 stored chunks could + // miss the budget under load. + let multiplied = total_bytes.saturating_mul(self.audit_response_honest_multiplier); + let scaled_secs = multiplied / bps; + // saturating_add avoids a panic if `scaled_secs` (or the floor + // plus it) would overflow `Duration::MAX`. 
+ self.audit_response_floor + .saturating_add(Duration::from_secs(scaled_secs)) + } + + /// Number of subtree leaves to spot-check against real chunk bytes per + /// audit (ADR-0002 real-bytes layer). Faking a fraction `x` of nonced + /// leaves survives only `(1 - x)^k`. + #[must_use] + pub fn audit_spotcheck_count(&self) -> u32 { + AUDIT_SPOTCHECK_COUNT + } + + /// Conservative leaf-count hint for sizing the subtree-audit response + /// deadline before the proof arrives. + /// + /// The selected subtree holds about `sqrt(key_count)` real leaves; sizing + /// for a large store keeps an honest peer with a big store from timing out. + #[must_use] + pub fn subtree_audit_timeout_leaf_hint(&self) -> usize { + SUBTREE_AUDIT_TIMEOUT_LEAF_HINT } /// Returns a random duration in `[audit_tick_interval_min, @@ -409,6 +620,109 @@ mod tests { assert!((AUDIT_FAILURE_TRUST_WEIGHT - 5.0).abs() <= f64::EPSILON); } + #[test] + fn audit_timeout_strike_threshold_is_three() { + // Smallest threshold that tolerates back-to-back transient slowness + // while still penalizing a persistently-slow non-storing peer within a + // few audit ticks. + assert_eq!(AUDIT_TIMEOUT_STRIKE_THRESHOLD, 3); + } + + #[test] + fn replication_protocol_id_is_v2() { + // The v12 storage-bound audit changes replication SEMANTICS. The + // protocol id MUST advance past v1 so v1 and v2 nodes never exchange + // replication traffic they can only half-interpret (rollout safety — + // see the const's doc). If this regresses to v1, mixed-version nodes + // would talk past each other and risk spurious penalties. 
+ assert_eq!(REPLICATION_PROTOCOL_ID, "autonomi.ant.replication.v2"); + } + + #[test] + fn audit_response_timeout_floor_at_zero_keys() { + let config = ReplicationConfig::default(); + assert_eq!( + config.audit_response_timeout(0), + Duration::from_secs(AUDIT_RESPONSE_FLOOR_SECS), + "zero-key challenge should yield the floor exactly" + ); + } + + #[test] + fn audit_response_timeout_scales_with_key_count() { + let config = ReplicationConfig::default(); + let t1 = config.audit_response_timeout(1); + let t10 = config.audit_response_timeout(10); + let t100 = config.audit_response_timeout(100); + assert!(t1 <= t10 && t10 < t100, "timeout must not decrease with k"); + + // Multiplier is applied before the divide so each chunk + // contributes ~0.4 s rather than rounding to 0 at small k. + // For k=1: (4_194_304 × 5) / 52_428_800 = 0 (still below 1 s), + // + 2 s floor = 2 s. + assert_eq!(t1, Duration::from_secs(2)); + + // For k=10: (10 × 4_194_304 × 5) / 52_428_800 = 4 s scaled, + // + 2 s floor = 6 s. An HDD-backed honest peer at 20 MB/s reads + // 40 MiB in ~2 s, comfortably inside the budget; a relay + // attacker fetching the same 40 MiB at 5 MB/s residential + // bandwidth needs ~8 s for the data alone, outside. + assert_eq!(t10, Duration::from_secs(6)); + + // For k=100: (100 × 4_194_304 × 5) / 52_428_800 = 40 s scaled, + // + 2 s floor = 42 s. + assert_eq!(t100, Duration::from_secs(42)); + } + + #[test] + fn audit_response_timeout_fits_honest_hdd_at_typical_sample_size() { + // The canonical audit sample is sqrt(N) at N stored chunks. + // At N=100 stored chunks, sample is 10. An HDD-backed honest + // peer at the slowest realistic random-read throughput (20 MB/s, + // well below modern HDDs which sustain 80-150 MB/s sequential) + // reads 10 × 4 MiB = 40 MiB in ~2 s. Add 300 ms cross-continent + // RTT, ~10 ms scheduling, ~3 ms ML-DSA sign, and the honest + // envelope is ~2.3 s. The 6 s budget at k=10 leaves >3 s of + // slack. 
+ let config = ReplicationConfig::default(); + let budget = config.audit_response_timeout(10); + let realistic_hdd_bps: u64 = 20 * 1024 * 1024; + let bytes: u64 = 10 * 4 * 1024 * 1024; + let honest_envelope_secs = bytes / realistic_hdd_bps + 1; // +1 s for network/scheduling/sign + assert!( + Duration::from_secs(honest_envelope_secs) < budget, + "honest HDD envelope ({honest_envelope_secs}s) must fit inside k=10 budget ({}s)", + budget.as_secs(), + ); + } + + #[test] + fn audit_response_timeout_relay_is_outside_envelope() { + // The intended invariant: an honest peer with the SSD-class + // read budget fits inside `audit_response_timeout(k)`, while a + // relay attacker fetching k*4MiB over residential bandwidth + // (≈ 5 MB/s realistic for sustained download) does NOT. Spot- + // check this at k=100: honest budget is 42s, relay needs at + // least 100 * 4 MiB / 5 MB/s = 80s for the data alone, which + // exceeds the budget. + let config = ReplicationConfig::default(); + let budget = config.audit_response_timeout(100); + let relay_data_only = Duration::from_secs(100 * 4 * 1024 * 1024 / (5 * 1024 * 1024)); + assert!( + relay_data_only > budget, + "relay fetch ({}s) must exceed honest audit budget ({}s)", + relay_data_only.as_secs(), + budget.as_secs(), + ); + } + + #[test] + fn audit_response_timeout_saturates_on_huge_k() { + let config = ReplicationConfig::default(); + // Should not panic or overflow at extreme k values. 
+ let _ = config.audit_response_timeout(usize::MAX); + } + #[test] fn quorum_threshold_zero_rejected() { let config = ReplicationConfig { diff --git a/src/replication/mod.rs b/src/replication/mod.rs index 81ea8ec5..21590dd1 100644 --- a/src/replication/mod.rs +++ b/src/replication/mod.rs @@ -17,6 +17,8 @@ pub mod admission; pub mod audit; pub mod bootstrap; +pub mod commitment; +pub mod commitment_state; pub mod config; pub mod fresh; pub mod neighbor_sync; @@ -24,7 +26,10 @@ pub mod paid_list; pub mod protocol; pub mod pruning; pub mod quorum; +pub mod recent_provers; pub mod scheduling; +pub mod storage_commitment_audit; +pub mod subtree; pub mod types; use std::collections::{HashMap, HashSet}; @@ -46,6 +51,8 @@ use crate::ant_protocol::XorName; use crate::error::{Error, Result}; use crate::payment::{PaymentVerifier, VerificationContext}; use crate::replication::audit::AuditTickResult; +use crate::replication::commitment::{commitment_hash, StorageCommitment}; +use crate::replication::commitment_state::{PeerCommitmentRecord, ResponderCommitmentState}; use crate::replication::config::{ max_parallel_fetch, ReplicationConfig, MAX_CONCURRENT_REPLICATION_SENDS, REPLICATION_PROTOCOL_ID, @@ -56,13 +63,14 @@ use crate::replication::protocol::{ VerificationResponse, }; use crate::replication::quorum::KeyVerificationOutcome; +use crate::replication::recent_provers::RecentProvers; use crate::replication::scheduling::ReplicationQueues; use crate::replication::types::{ AuditFailureReason, BootstrapClaimObservation, BootstrapState, FailureEvidence, HintPipeline, NeighborSyncState, PeerSyncRecord, RepairProofs, VerificationEntry, VerificationState, }; use crate::storage::LmdbStorage; -use saorsa_core::identity::PeerId; +use saorsa_core::identity::{NodeIdentity, PeerId}; use saorsa_core::{DhtNetworkEvent, P2PEvent, P2PNode, TrustEvent}; // --------------------------------------------------------------------------- @@ -85,6 +93,13 @@ struct VerificationCycleContext<'a> { 
bootstrap_state: &'a Arc>, is_bootstrapping: &'a Arc>, bootstrap_complete_notify: &'a Arc, + /// v12 §6 holder-eligibility inputs. The verifier downgrades a + /// peer's Present claim to Unresolved unless they're a credited + /// holder of the key (i.e. they recently passed a commitment-bound + /// audit on it under their currently-credited commitment hash). + last_commitment_by_peer: &'a Arc>>, + ever_capable_peers: &'a Arc>>, + recent_provers: &'a Arc>, } /// Fetch worker polling interval in milliseconds. @@ -93,9 +108,6 @@ const FETCH_WORKER_POLL_MS: u64 = 100; /// Verification worker polling interval in milliseconds. const VERIFICATION_WORKER_POLL_MS: u64 = 250; -/// Bootstrap drain check interval in seconds. -const BOOTSTRAP_DRAIN_CHECK_SECS: u64 = 5; - /// Standard trust event weight for per-operation success/failure signals. /// /// Used for individual replication fetch outcomes, integrity check failures, @@ -103,6 +115,69 @@ const BOOTSTRAP_DRAIN_CHECK_SECS: u64 = 5; /// is reserved for confirmed audit failures. const REPLICATION_TRUST_WEIGHT: f64 = 1.0; +/// Bootstrap drain check interval in seconds. +const BOOTSTRAP_DRAIN_CHECK_SECS: u64 = 5; + +/// How often the responder rebuilds + rotates its storage commitment. +/// +/// Each rebuild scans LMDB to compute leaf hashes; for ~10k keys this is +/// sub-100ms (BLAKE3 + tree build). Retention is gossip-anchored, NOT +/// rotation-anchored: the responder stays answerable for the current +/// commitment plus the last `RETAINED_GOSSIPED_COMMITMENTS` (= 2) it +/// actually gossiped, each kept for `GOSSIP_ANSWERABILITY_TTL` (3 h) after +/// its last emission (see `commitment_state`). So the rotation cadence does +/// not by itself bound answerability — a gossiped commitment stays +/// answerable across rotations until its gossip TTL lapses. +/// +/// Default: 1 hour, aligned with the worst-case neighbor-sync cooldown +/// (`NEIGHBOR_SYNC_COOLDOWN_SECS = 3600`). 
Because the gossip TTL (3 h) +/// comfortably exceeds the gap between our rotation and the next gossip +/// arrival at a remote peer, this prevents the "unknown commitment hash" -> +/// Idle audit-skip pattern from being the common case. +/// +/// Why not faster: the v12 pin is bound to a specific point-in-time +/// commitment, so rotation isn't security-critical for pin freshness — +/// only for keeping the committed key set current as the responder +/// writes new keys. 1 hour is plenty for that, and slow enough that +/// honest auditors mostly hit `current` or `previous` rather than the +/// "rotated past" case. +const COMMITMENT_ROTATION_INTERVAL_SECS: u64 = 3600; + +/// Minimum interval between commitment signature verifications for a +/// single peer (v10/v12 §2 step 3 + §11 `DoS`). +/// +/// A sybil that bypasses the routing-table gate (e.g. by transient +/// bucket pollution) could otherwise force one ML-DSA-65 verify (~1 ms) +/// per gossip message. This rate limit caps the verify-per-peer rate +/// at 1/min, which is comfortably above the legitimate gossip cadence +/// (the 10-20 min neighbor-sync round on each peer). +const COMMITMENT_SIG_VERIFY_MIN_INTERVAL: Duration = Duration::from_secs(60); + +/// Hard cap on the size of `last_commitment_by_peer`. +/// +/// Bounds the per-process memory cost of the auditor's per-peer +/// commitment cache. Each entry holds a `StorageCommitment` +/// (~5 KiB: 1952-byte pubkey + 3293-byte signature + small fields). +/// At 4096 entries the cache is ~20 MiB, which comfortably covers a +/// realistic close-group neighborhood. When the cap is hit, one +/// arbitrary existing entry is evicted on insert (`HashMap` iteration +/// order is unspecified; we do not track insertion order). 
The +/// `PeerRemoved` handler proactively drops entries as the DHT +/// detects departures, and `ingest_peer_commitment` only admits +/// commitments from peers currently in the routing table — together +/// the cap is the third line of defence against sybil/churn flooding. +const MAX_LAST_COMMITMENT_BY_PEER: usize = 4096; + +/// Cap on the sticky `ever_capable_peers` set. Bounds memory so a +/// long-running bootstrap node cannot have the set grow without limit +/// from peer-id churn. Sized at 4x `MAX_LAST_COMMITMENT_BY_PEER` so +/// the set comfortably outlives normal eviction churn but still caps the +/// blast radius of identity-rotation attacks. Once full we refuse new +/// inserts (no eviction) — keeps the historic set stable; new v12 +/// peers above the cap are treated as legacy on rejoin, which matches +/// the behaviour before this set existed, not a security regression. +const MAX_EVER_CAPABLE_PEERS: usize = 4 * MAX_LAST_COMMITMENT_BY_PEER; + // --------------------------------------------------------------------------- // ReplicationEngine // --------------------------------------------------------------------------- @@ -129,6 +204,24 @@ pub struct ReplicationEngine { /// are lightweight (`PeerSyncRecord` is two fields) and peer IDs are /// naturally bounded by the routing table's k-bucket capacity. sync_history: Arc>>, + /// Per-peer consecutive audit-timeout strike counter. + /// + /// A timeout increments the peer's strike count; a successful audit + /// response resets it to zero. Only when a peer reaches + /// [`config::AUDIT_TIMEOUT_STRIKE_THRESHOLD`] consecutive timeouts is a + /// timeout reported as an `ApplicationFailure` trust event. This separates + /// honest transient slowness (resets on the next normal response) from a + /// peer that does not store the data and is slow on every audit. Lives + /// outside `NeighborSyncState` so it is never wiped by a neighbor-sync + /// cycle reset.
Grows with peer churn like `sync_history`; entries are a + /// single `u32` and peer IDs are bounded by k-bucket capacity. + audit_timeout_strikes: Arc>>, + /// Per-peer cooldown for gossip-triggered subtree audits (ADR-0002). + /// + /// Records when each peer was last audited so a burst of gossiped + /// commitment changes cannot spawn back-to-back audits of the same peer. + /// Bounded by routing-table membership and cleaned on `PeerRemoved`. + audit_on_gossip_cooldown: Arc>>, /// Completed local neighbor-sync cycle epoch for proof maturity. sync_cycle_epoch: Arc>, /// Per-key repair proof tracking for audit eligibility. @@ -141,6 +234,50 @@ pub struct ReplicationEngine { sync_trigger: Arc, /// Notified when `is_bootstrapping` transitions from `true` to `false`. bootstrap_complete_notify: Arc, + /// Node identity (for signing storage commitments). + /// + /// Phase 3 of the v12 storage-bound audit design. The responder + /// uses this to sign its periodically-built `StorageCommitment`. + identity: Arc, + /// Responder-side commitment state (two-slot atomic rotation). + /// + /// Periodically rebuilt from the live LMDB key set; gossiped on + /// outbound `NeighborSyncRequest`/`Response`; consulted by the + /// commitment-bound audit handler. + commitment_state: Arc, + /// Auditor-side per-peer commitment record (last known commitment + + /// sticky `commitment_capable` flag). + /// + /// Populated whenever an inbound gossip carries a verified + /// commitment from the sender. Used by `audit_tick` to snapshot + /// `expected_commitment_hash` into outbound challenges, and by + /// holder-eligibility (§6) to decide whether a peer's `recent_provers` + /// proof should be honoured. The sticky `commitment_capable` flag + /// flips true on first successful ingest and never reverts (§2 + /// step 5). 
+ last_commitment_by_peer: Arc>>, + /// Sticky set of peer IDs we have EVER seen carrying a v12 + /// commitment, independent of whether their commitment bytes are + /// still in `last_commitment_by_peer`. The §6 holder-eligibility + /// closure consults this set to keep treating churned-out + /// previously-v12 peers as v12-capable (rather than degrading them + /// to "legacy" credit-unconditionally) when they re-appear on the + /// network before their next gossip arrives. Bounded growth: the + /// set is capped at `MAX_EVER_CAPABLE_PEERS` entries (new inserts + /// are refused once full; see that constant). + ever_capable_peers: Arc>>, + /// Auditor-side holder-eligibility cache (v12 §6). + /// + /// Recorded on successful commitment-bound audit; read by future + /// quorum / paid-list eligibility checks (phase-3 stretch). + recent_provers: Arc>, + /// Per-peer last sig-verify attempt timestamp for the §2 step 3 / + /// §11 `DoS` rate limit. Bumped on EVERY verify attempt (success or + /// failure) so a peer we've never successfully verified can't burn + /// CPU on a flood of structurally-plausible-but-invalid gossips. + /// Lives separately from `last_commitment_by_peer` because that + /// map's records only exist after a successful verify. + sig_verify_attempts: Arc>>, /// Limits concurrent outbound replication sends to prevent bandwidth /// saturation on home broadband connections. send_semaphore: Arc, @@ -162,11 +299,13 @@ impl ReplicationEngine { /// /// Returns an error if the `PaidList` LMDB environment cannot be opened /// or if the configuration fails validation.
+ #[allow(clippy::too_many_arguments)] pub async fn new( config: ReplicationConfig, p2p_node: Arc, storage: Arc, payment_verifier: Arc, + identity: Arc, root_dir: &Path, fresh_write_rx: mpsc::UnboundedReceiver, shutdown: CancellationToken, @@ -191,12 +330,20 @@ impl ReplicationEngine { queues: Arc::new(RwLock::new(ReplicationQueues::new())), sync_state: Arc::new(RwLock::new(initial_neighbors)), sync_history: Arc::new(RwLock::new(HashMap::new())), + audit_timeout_strikes: Arc::new(RwLock::new(HashMap::new())), + audit_on_gossip_cooldown: Arc::new(RwLock::new(HashMap::new())), sync_cycle_epoch: Arc::new(RwLock::new(0)), repair_proofs: Arc::new(RwLock::new(RepairProofs::new())), bootstrap_state: Arc::new(RwLock::new(BootstrapState::new())), is_bootstrapping: Arc::new(RwLock::new(true)), sync_trigger: Arc::new(Notify::new()), bootstrap_complete_notify: Arc::new(Notify::new()), + identity, + commitment_state: Arc::new(ResponderCommitmentState::new()), + last_commitment_by_peer: Arc::new(RwLock::new(HashMap::new())), + ever_capable_peers: Arc::new(RwLock::new(HashSet::new())), + recent_provers: Arc::new(RwLock::new(RecentProvers::new())), + sig_verify_attempts: Arc::new(RwLock::new(HashMap::new())), send_semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT_REPLICATION_SENDS)), fresh_write_rx: Some(fresh_write_rx), shutdown, @@ -210,6 +357,97 @@ impl ReplicationEngine { &self.paid_list } + /// Get a reference to the responder's commitment state. Used by audit + /// handlers to look up commitments by hash; used by the rotation tick + /// to install fresh ones. + #[must_use] + pub fn commitment_state(&self) -> &Arc { + &self.commitment_state + } + + /// Get a reference to the auditor's last-commitment-by-peer table. + #[must_use] + pub fn last_commitment_by_peer(&self) -> &Arc>> { + &self.last_commitment_by_peer + } + + /// Get a reference to the holder-eligibility cache. Phase-3 stretch: + /// will be read by quorum / paid-list eligibility checks. 
+ #[must_use] + pub fn recent_provers(&self) -> &Arc> { + &self.recent_provers + } + + /// Test-only: rebuild + rotate this node's storage commitment now over its + /// current key set (normally on a 1h timer). Lets a test commit to chunks it + /// just stored without waiting for the rotation cadence. + /// + /// # Errors + /// + /// Propagates any error from reading the local key set or building/signing + /// the commitment. + #[cfg(any(test, feature = "test-utils"))] + pub async fn rebuild_commitment_now(&self) -> Result<()> { + rebuild_and_rotate_commitment( + &self.storage, + &self.identity, + &self.commitment_state, + &self.p2p_node, + &self.config, + ) + .await + } + + /// Test-only: directly seed this node's cached commitment for `peer`, + /// simulating "we received `peer`'s gossiped commitment" without depending + /// on neighbor-sync propagation timing. Lets a two-node audit test pin the + /// peer's commitment deterministically. + #[cfg(any(feature = "test-utils", test))] + pub async fn inject_peer_commitment_for_test( + &self, + peer: &PeerId, + commitment: StorageCommitment, + ) { + let now = Instant::now(); + self.last_commitment_by_peer + .write() + .await + .insert(*peer, PeerCommitmentRecord::from_verified(commitment, now)); + self.ever_capable_peers.write().await.insert(*peer); + } + + /// Test-only: run ONE subtree audit against `peer` right now, pinned to the + /// commitment this node has cached for it (from gossip), over the live wire. + /// Returns the audit outcome so tests can assert honest-pass / adversary-fail + /// in a real two-node setting without waiting for the gossip cadence. + /// + /// Returns `AuditTickResult::Idle` if we have no cached commitment for the + /// peer yet (gossip hasn't reached us). Gated to test builds. 
+ #[cfg(any(test, feature = "test-utils"))] + pub async fn audit_peer_now(&self, peer: &PeerId) -> audit::AuditTickResult { + let target = { + let map = self.last_commitment_by_peer.read().await; + map.get(peer) + .and_then(|r| r.last_commitment.as_ref()) + .and_then(|c| commitment_hash(c).map(|h| (h, c.key_count))) + }; + let Some((pin, key_count)) = target else { + return audit::AuditTickResult::Idle; + }; + let credit = storage_commitment_audit::AuditCredit { + recent_provers: &self.recent_provers, + }; + storage_commitment_audit::run_subtree_audit( + &self.p2p_node, + &self.config, + peer, + pin, + key_count, + Some(&credit), + ) + .await + } + /// Start all background tasks. /// /// `dht_events` must be subscribed **before** `P2PNode::start()` so that @@ -225,7 +463,12 @@ impl ReplicationEngine { self.start_message_handler(); self.start_neighbor_sync_loop(); self.start_self_lookup_loop(); + // Audit #2 (responsible-chunk): periodic tick auditing peers for the + // chunks they SHOULD store (responsibility + prior hint). self.start_audit_loop(); + // Audit #1 (storage-commitment) is gossip-triggered in the message + // handler when a peer's commitment is ingested, not on a periodic tick. 
+ self.start_commitment_rotation_loop(); self.start_fetch_worker(); self.start_verification_worker(); self.start_bootstrap_sync(dht_events); @@ -367,6 +610,25 @@ impl ReplicationEngine { let sync_cycle_epoch = Arc::clone(&self.sync_cycle_epoch); let repair_proofs = Arc::clone(&self.repair_proofs); let sync_trigger = Arc::clone(&self.sync_trigger); + let my_commitment_state = Arc::clone(&self.commitment_state); + let last_commitment_by_peer = Arc::clone(&self.last_commitment_by_peer); + let ever_capable_peers = Arc::clone(&self.ever_capable_peers); + let recent_provers = Arc::clone(&self.recent_provers); + let sig_verify_attempts = Arc::clone(&self.sig_verify_attempts); + let audit_timeout_strikes = Arc::clone(&self.audit_timeout_strikes); + let audit_on_gossip_cooldown = Arc::clone(&self.audit_on_gossip_cooldown); + let sync_state = Arc::clone(&self.sync_state); + + // ADR-0002 gossip-audit trigger: bundled state so an ingested *changed* + // commitment can spawn a probabilistic, cooldown-gated subtree audit. + let gossip_audit = GossipAuditTrigger { + p2p_node: Arc::clone(&p2p), + config: Arc::clone(&config), + recent_provers: Arc::clone(&recent_provers), + sync_state: Arc::clone(&sync_state), + audit_timeout_strikes: Arc::clone(&audit_timeout_strikes), + cooldown: Arc::clone(&audit_on_gossip_cooldown), + }; let handle = tokio::spawn(async move { loop { @@ -409,6 +671,11 @@ impl ReplicationEngine { &sync_history, &sync_cycle_epoch, &repair_proofs, + &last_commitment_by_peer, + &ever_capable_peers, + &sig_verify_attempts, + &my_commitment_state, + &gossip_audit, rr_message_id.as_deref(), ).await { Ok(()) => {} @@ -439,6 +706,27 @@ impl ReplicationEngine { } DhtNetworkEvent::PeerRemoved { peer_id } => { repair_proofs.write().await.remove_peer(&peer_id); + // v12: drop the commitment bytes and the + // recent-prover credit so a churn / sybil + // attacker cannot leave behind one + // StorageCommitment per identity in + // `last_commitment_by_peer`. 
Also drop the + // sig-verify rate-limit timestamp. + last_commitment_by_peer.write().await.remove(&peer_id); + recent_provers.write().await.forget_peer(&peer_id); + sig_verify_attempts.write().await.remove(&peer_id); + // Drop the timeout-strike entry too, so a + // departed peer leaves no residual (keeps this + // map bounded under churn, like its siblings). + audit_timeout_strikes.write().await.remove(&peer_id); + // Same for the gossip-audit cooldown (ADR-0002). + audit_on_gossip_cooldown.write().await.remove(&peer_id); + // The sticky `commitment_capable` flag is + // preserved orthogonally via + // `ever_capable_peers` — even after this + // removal, a re-joining peer continues to + // be treated as v12-capable rather than + // legacy (§3 shield). } _ => {} } @@ -464,6 +752,22 @@ impl ReplicationEngine { let is_bootstrapping = Arc::clone(&self.is_bootstrapping); let bootstrap_state = Arc::clone(&self.bootstrap_state); let sync_trigger = Arc::clone(&self.sync_trigger); + let commitment_state = Arc::clone(&self.commitment_state); + let last_commitment_by_peer = Arc::clone(&self.last_commitment_by_peer); + let ever_capable_peers = Arc::clone(&self.ever_capable_peers); + let sig_verify_attempts = Arc::clone(&self.sig_verify_attempts); + // ADR-0002: a peer's commitment also arrives on the sync RESPONSE path + // (we initiated, they piggybacked theirs). Carry a gossip-audit trigger + // here too so a peer that only ever answers — never initiates sync — + // is still audited; otherwise it could fully evade auditing. 
+ let gossip_audit = GossipAuditTrigger { + p2p_node: Arc::clone(&p2p), + config: Arc::clone(&config), + recent_provers: Arc::clone(&self.recent_provers), + sync_state: Arc::clone(&sync_state), + audit_timeout_strikes: Arc::clone(&self.audit_timeout_strikes), + cooldown: Arc::clone(&self.audit_on_gossip_cooldown), + }; let handle = tokio::spawn(async move { loop { @@ -492,6 +796,11 @@ impl ReplicationEngine { &repair_proofs, &is_bootstrapping, &bootstrap_state, + &commitment_state, + &last_commitment_by_peer, + &ever_capable_peers, + &sig_verify_attempts, + &gossip_audit, ) => {} } } @@ -522,6 +831,12 @@ impl ReplicationEngine { self.task_handles.push(handle); } + /// Periodic responsible-chunk audit loop (audit #2): every + /// [`ReplicationConfig::random_audit_tick_interval`] (~10-20 min), audit one + /// eligible close peer for the chunks it *should* be storing (by + /// responsibility and prior repair hint), independent of the gossip-triggered + /// storage-commitment audit. Waits for bootstrap to drain, then runs one tick + /// immediately and periodically thereafter. fn start_audit_loop(&mut self) { let p2p = Arc::clone(&self.p2p_node); let storage = Arc::clone(&self.storage); @@ -599,6 +914,88 @@ impl ReplicationEngine { self.task_handles.push(handle); } + /// Periodically rebuild + sign + rotate the responder's storage + /// commitment. + /// + /// Phase 3 of the v12 storage-bound audit. Once per + /// [`COMMITMENT_ROTATION_INTERVAL_SECS`], the responder reads the + /// current LMDB key set, builds a Merkle tree (for content-addressed + /// chunks `bytes_hash == key`, so no chunk re-read is needed), signs + /// the root with the node's `MlDsaSecretKey`, and rotates the result + /// into `commitment_state`. Old `previous` slot is dropped by the + /// rotate (per `ResponderCommitmentState::rotate`). 
+ /// + /// Skips if the key set is empty (no commitment to make) — the + /// auditor side falls back to the legacy plain-digest path for + /// peers that have never gossiped a commitment. + fn start_commitment_rotation_loop(&mut self) { + let storage = Arc::clone(&self.storage); + let identity = Arc::clone(&self.identity); + let commitment_state = Arc::clone(&self.commitment_state); + let shutdown = self.shutdown.clone(); + let p2p = Arc::clone(&self.p2p_node); + let config = Arc::clone(&self.config); + let sync_trigger = Arc::clone(&self.sync_trigger); + let recent_provers = Arc::clone(&self.recent_provers); + + let handle = tokio::spawn(async move { + // Build the first commitment immediately on startup so a + // restarted node can answer commitment-bound audits right + // away — otherwise current() stays None for a full rotation + // interval and audits silently fall back to legacy. + // + // After the first build, trigger an immediate neighbor-sync + // round so the new commitment gossips out within seconds. + // Without this, after a restart remote auditors keep pinning + // the pre-restart (rotated-away) hash until their normal + // sync cadence elapses — up to 1 h in the worst case, + // during which time commitment-bound audits hit "unknown + // commitment hash" -> Idle no-ops. + // ML-DSA signatures are randomized so we cannot reproduce + // the pre-restart hash; the only honest path to recovery + // is fast re-gossip. + if let Err(e) = + rebuild_and_rotate_commitment(&storage, &identity, &commitment_state, &p2p, &config) + .await + { + warn!("Initial commitment build failed: {e}"); + } else { + sync_trigger.notify_one(); + } + loop { + tokio::select! 
{ + () = shutdown.cancelled() => break, + () = tokio::time::sleep( + std::time::Duration::from_secs(COMMITMENT_ROTATION_INTERVAL_SECS) + ) => { + if let Err(e) = rebuild_and_rotate_commitment( + &storage, + &identity, + &commitment_state, + &p2p, + &config, + ).await { + warn!("Commitment rotation failed: {e}"); + } + // Piggyback a sweep of expired recent_provers + // entries on the rotation tick (same cadence, + // 1 h). is_credited_holder already honours the + // TTL on read, but the sweep reclaims memory + // for entries we'll never re-read. + let dropped = recent_provers.write().await.sweep_expired( + std::time::Instant::now() + ); + if dropped > 0 { + debug!("recent_provers: swept {dropped} expired entries"); + } + } + } + } + debug!("Commitment rotation loop shut down"); + }); + self.task_handles.push(handle); + } + #[allow(clippy::too_many_lines, clippy::option_if_let_else)] fn start_fetch_worker(&mut self) { let p2p = Arc::clone(&self.p2p_node); @@ -774,6 +1171,9 @@ impl ReplicationEngine { let bootstrap_state = Arc::clone(&self.bootstrap_state); let is_bootstrapping = Arc::clone(&self.is_bootstrapping); let bootstrap_complete_notify = Arc::clone(&self.bootstrap_complete_notify); + let last_commitment_by_peer = Arc::clone(&self.last_commitment_by_peer); + let ever_capable_peers = Arc::clone(&self.ever_capable_peers); + let recent_provers = Arc::clone(&self.recent_provers); let handle = tokio::spawn(async move { loop { @@ -791,6 +1191,9 @@ impl ReplicationEngine { bootstrap_state: &bootstrap_state, is_bootstrapping: &is_bootstrapping, bootstrap_complete_notify: &bootstrap_complete_notify, + last_commitment_by_peer: &last_commitment_by_peer, + ever_capable_peers: &ever_capable_peers, + recent_provers: &recent_provers, }; run_verification_cycle(ctx).await; } @@ -828,6 +1231,10 @@ impl ReplicationEngine { let bootstrap_complete_notify = Arc::clone(&self.bootstrap_complete_notify); let sync_cycle_epoch = Arc::clone(&self.sync_cycle_epoch); let repair_proofs = 
Arc::clone(&self.repair_proofs); + let my_commitment_state = Arc::clone(&self.commitment_state); + let last_commitment_by_peer = Arc::clone(&self.last_commitment_by_peer); + let ever_capable_peers = Arc::clone(&self.ever_capable_peers); + let sig_verify_attempts = Arc::clone(&self.sig_verify_attempts); let handle = tokio::spawn(async move { // Wait for DHT bootstrap to complete before snapshotting @@ -882,12 +1289,44 @@ impl ReplicationEngine { &paid_list, &config, bootstrapping, + // Atomically snapshot + mark-gossiped: emitted in the + // bootstrap-sync request, so we stay answerable for it + // (ADR-0002). One critical section avoids a TOCTOU where a + // concurrent retire/rotate drops the slot between read and + // mark. + my_commitment_state + .current_for_gossip() + .map(|b| b.commitment().clone()), ) .await; bootstrap::decrement_pending_requests(&bootstrap_state, 1).await; if let Some(outcome) = outcome { + // Ingest the peer's piggybacked commitment from the + // response (same verification as the request path). + // Bootstrap is the FIRST gossip we receive from most + // peers, so this populates last_commitment_by_peer. + // + // We intentionally do NOT trigger a gossip-audit here: + // during bootstrap this node may itself still be + // bootstrapping (audits are gated on that), and the + // close-group/RT view is not yet stable. The peer is + // audited on the first STEADY-STATE neighbor-sync round + // after bootstrap drains (request + response paths both + // trigger), which is within one sync cycle — so caching + // the commitment here is sufficient and there is no + // coverage gap (ADR-0002). 
+ ingest_peer_commitment( + peer, + outcome.response.commitment.as_ref(), + &p2p, + &last_commitment_by_peer, + &ever_capable_peers, + &sig_verify_attempts, + ) + .await; // sig_verify_attempts in scope from line ~1080 + if !outcome.response.bootstrapping { record_sent_replica_hints( peer, @@ -956,7 +1395,7 @@ impl ReplicationEngine { /// When `rr_message_id` is `Some`, the request arrived via the `/rr/` /// request-response path and the response must be sent via `send_response` /// so saorsa-core can route it back to the waiting `send_request` caller. -#[allow(clippy::too_many_arguments)] +#[allow(clippy::too_many_arguments, clippy::too_many_lines)] async fn handle_replication_message( source: &PeerId, data: &[u8], @@ -971,6 +1410,11 @@ async fn handle_replication_message( sync_history: &Arc>>, sync_cycle_epoch: &Arc>, repair_proofs: &Arc>, + last_commitment_by_peer: &Arc>>, + ever_capable_peers: &Arc>>, + sig_verify_attempts: &Arc>>, + my_commitment_state: &Arc, + gossip_audit: &GossipAuditTrigger, rr_message_id: Option<&str>, ) -> Result<()> { let msg = ReplicationMessage::decode(data) @@ -1004,6 +1448,22 @@ async fn handle_replication_message( } ReplicationMessageBody::NeighborSyncRequest(ref request) => { let bootstrapping = *is_bootstrapping.read().await; + // Phase-3 storage-bound audit: store the sender's + // commitment for use as `expected_commitment_hash` in + // future audits. Verify signature before storing so a peer + // cannot inject a forged commitment for someone else. 
+ if let Some(target) = ingest_peer_commitment( + source, + request.commitment.as_ref(), + p2p_node, + last_commitment_by_peer, + ever_capable_peers, + sig_verify_attempts, + ) + .await + { + maybe_trigger_gossip_audit(gossip_audit, source, target).await; + } handle_neighbor_sync_request( source, request, @@ -1017,6 +1477,11 @@ async fn handle_replication_message( sync_history, sync_cycle_epoch, repair_proofs, + // Atomically snapshot + mark-gossiped: emitted in the sync + // response, so we must stay answerable for it (ADR-0002). + my_commitment_state + .current_for_gossip() + .map(|b| b.commitment().clone()), msg.request_id, rr_message_id, ) @@ -1046,6 +1511,10 @@ async fn handle_replication_message( .await } ReplicationMessageBody::AuditChallenge(ref challenge) => { + // Responsible-chunk audit (audit #2) responder: answer with per-key + // possession digests. This same handler also answers the + // prune-confirmation audit, which sends the same `AuditChallenge` + // wire message. let bootstrapping = *is_bootstrapping.read().await; handle_audit_challenge_msg( source, @@ -1058,12 +1527,58 @@ async fn handle_replication_message( ) .await } + ReplicationMessageBody::SubtreeAuditChallenge(ref challenge) => { + // Gossip-triggered storage-bound subtree audit (ADR-0002). + let bootstrapping = *is_bootstrapping.read().await; + let response = storage_commitment_audit::handle_subtree_challenge( + challenge, + storage, + p2p_node.peer_id(), + bootstrapping, + Some(my_commitment_state), + ) + .await; + send_replication_response( + source, + p2p_node, + msg.request_id, + ReplicationMessageBody::SubtreeAuditResponse(response), + rr_message_id, + ) + .await; + Ok(()) + } + ReplicationMessageBody::SubtreeByteChallenge(ref challenge) => { + // Round 2 of the storage audit (ADR-0002): serve the original bytes + // for the auditor's nonce-selected spot-check keys, or signal + // `Absent` for a committed key we can no longer produce. 
+ let bootstrapping = *is_bootstrapping.read().await; + let response = storage_commitment_audit::handle_subtree_byte_challenge( + challenge, + storage, + p2p_node.peer_id(), + bootstrapping, + Some(my_commitment_state), + ) + .await; + send_replication_response( + source, + p2p_node, + msg.request_id, + ReplicationMessageBody::SubtreeByteResponse(response), + rr_message_id, + ) + .await; + Ok(()) + } // Response messages are handled by their respective request initiators. ReplicationMessageBody::FreshReplicationResponse(_) | ReplicationMessageBody::NeighborSyncResponse(_) | ReplicationMessageBody::VerificationResponse(_) | ReplicationMessageBody::FetchResponse(_) - | ReplicationMessageBody::AuditResponse(_) => Ok(()), + | ReplicationMessageBody::AuditResponse(_) + | ReplicationMessageBody::SubtreeAuditResponse(_) + | ReplicationMessageBody::SubtreeByteResponse(_) => Ok(()), } } @@ -1328,6 +1843,7 @@ async fn handle_neighbor_sync_request( sync_history: &Arc>>, sync_cycle_epoch: &Arc>, repair_proofs: &Arc>, + my_commitment: Option, request_id: u64, rr_message_id: Option<&str>, ) -> Result<()> { @@ -1349,6 +1865,7 @@ async fn handle_neighbor_sync_request( paid_list, config, is_bootstrapping, + my_commitment.clone(), ) .await; @@ -1508,6 +2025,9 @@ async fn handle_fetch_request( Ok(()) } +/// Responder for an incoming `AuditChallenge` (responsible-chunk audit #2, and +/// the prune-confirmation audit, which reuses the same wire message): reply with +/// per-key possession digests. 
async fn handle_audit_challenge_msg( source: &PeerId, challenge: &protocol::AuditChallenge, @@ -1640,6 +2160,11 @@ async fn run_neighbor_sync_round( repair_proofs: &Arc>, is_bootstrapping: &Arc>, bootstrap_state: &Arc>, + commitment_state: &Arc, + last_commitment_by_peer: &Arc>>, + ever_capable_peers: &Arc>>, + sig_verify_attempts: &Arc>>, + gossip_audit: &GossipAuditTrigger, ) { let self_id = *p2p_node.peer_id(); let bootstrapping = *is_bootstrapping.read().await; @@ -1677,6 +2202,7 @@ async fn run_neighbor_sync_round( repair_proofs, current_sync_epoch, allow_remote_prune_audits, + commitment_state: Some(commitment_state), }) .await; @@ -1719,6 +2245,15 @@ async fn run_neighbor_sync_round( debug!("Neighbor sync: syncing with {} peers", batch.len()); + // Snapshot our current commitment once per round so all peers in + // this batch see the same thing (gossip is the responder's attestation; + // same value across the batch is fine and reduces RwLock churn). Atomically + // snapshot + mark-gossiped so we stay answerable for exactly what we emit + // (ADR-0002 retention), with no TOCTOU vs a concurrent retire/rotate. + let my_commitment = commitment_state + .current_for_gossip() + .map(|b| b.commitment().clone()); + // Sync with each peer in the batch. 
for peer in &batch { let outcome = neighbor_sync::sync_with_peer_with_outcome( @@ -1728,6 +2263,7 @@ async fn run_neighbor_sync_round( paid_list, config, bootstrapping, + my_commitment.clone(), ) .await; @@ -1748,6 +2284,10 @@ async fn run_neighbor_sync_round( sync_history, sync_cycle_epoch, repair_proofs, + last_commitment_by_peer, + ever_capable_peers, + sig_verify_attempts, + gossip_audit, ) .await; } else { @@ -1766,6 +2306,7 @@ async fn run_neighbor_sync_round( paid_list, config, bootstrapping, + my_commitment.clone(), ) .await; @@ -1786,6 +2327,10 @@ async fn run_neighbor_sync_round( sync_history, sync_cycle_epoch, repair_proofs, + last_commitment_by_peer, + ever_capable_peers, + sig_verify_attempts, + gossip_audit, ) .await; } @@ -1813,7 +2358,29 @@ async fn handle_sync_response( sync_history: &Arc>>, sync_cycle_epoch: &Arc>, repair_proofs: &Arc>, + last_commitment_by_peer: &Arc>>, + ever_capable_peers: &Arc>>, + sig_verify_attempts: &Arc>>, + gossip_audit: &GossipAuditTrigger, ) { + // Ingest the peer's commitment if they piggybacked one on the response. + // Same verification as the request path (peer-id binding + signature); + // forged commitments are dropped at the edge. A *changed* commitment here + // is a gossip-audit trigger just like on the request path — so a peer that + // only ever answers sync (never initiates) is still audited (ADR-0002). + if let Some(target) = ingest_peer_commitment( + peer, + resp.commitment.as_ref(), + p2p_node, + last_commitment_by_peer, + ever_capable_peers, + sig_verify_attempts, + ) + .await + { + maybe_trigger_gossip_audit(gossip_audit, peer, target).await; + } + // Record successful sync. { let mut state = sync_state.write().await; @@ -2032,6 +2599,9 @@ async fn run_verification_cycle(ctx: VerificationCycleContext<'_>) { bootstrap_state, is_bootstrapping, bootstrap_complete_notify, + last_commitment_by_peer, + ever_capable_peers, + recent_provers, } = ctx; // Evict stale entries that have been pending too long (e.g. 
unreachable @@ -2170,6 +2740,83 @@ async fn run_verification_cycle(ctx: VerificationCycleContext<'_>) { // Step 3: Evaluate results — collect outcomes without holding the write // lock across paid-list I/O. + // + // v12 §6 holder-eligibility: snapshot the per-peer last-commitment + // table and recent_provers cache up front so the synchronous + // evaluate_key_evidence_with_holder_check predicate can consult + // them without awaiting. The predicate downgrades a Present + // claim to Unresolved unless the peer is credited for that key. + // Snapshot per-peer commitment data. We need two views: + // - `commitment_by_peer_snapshot`: peers that currently have + // a verified commitment record on file (used to look up + // their current hash). + // - `capable_peer_snapshot`: the sticky "ever v12-capable" + // set. Sourced from a separate set rather than the + // commitment map so eviction (PeerRemoved cleanup, sybil + // cap at `MAX_LAST_COMMITMENT_BY_PEER`) does NOT downgrade + // a previously-v12 peer to "legacy" credit-unconditionally. + // Legacy / pre-v12 peers that have never sent a commitment + // remain absent from the set and are credited via the + // legacy path so mixed-version networks stay live. + let commitment_by_peer_snapshot: HashMap = { + let map = last_commitment_by_peer.read().await; + map.iter() + .filter_map(|(p, rec)| { + rec.last_commitment.as_ref().and_then(|c| { + crate::replication::commitment::commitment_hash(c).map(|h| (*p, h)) + }) + }) + .collect() + }; + let capable_peer_snapshot: HashSet = ever_capable_peers.read().await.clone(); + // Take a full snapshot of recent_provers under the read lock, + // then release. The cache is bounded (16/key × keys), so the + // clone is cheap. + let provers_snapshot = recent_provers.read().await.clone(); + // For the replica-fetch path, we need to know whether THIS + // node already holds the key being verified. 
The v12 §6 + // holder-credit gate is meant to prevent uncredited Present + // claims from contributing to paid-list / reward quorum for + // keys we DO hold (and could audit ourselves). For keys we + // are trying to FETCH (i.e. not in local storage), there is + // no possible local audit credit, and gating the presence + // quorum on credit would deadlock replica-repair in a + // fully v12-capable close group. + let mut locally_held: HashSet = HashSet::new(); + for key in &keys_needing_network { + if storage.exists(key).unwrap_or(false) { + locally_held.insert(*key); + } + } + let holder_credit = |peer: &PeerId, key: &XorName| -> bool { + if !locally_held.contains(key) { + // Replica-fetch path: we don't hold this key, so we + // cannot have collected audit credit for it. Trust + // Present claims to drive fetch-source promotion; + // chunk-PUT payment_verifier is the security backstop + // when the bytes actually arrive. + return true; + } + if !capable_peer_snapshot.contains(peer) { + // Pre-v12 / legacy peer that has never gossiped a + // commitment. The v12 §6 holder-eligibility check + // doesn't apply: their Present evidence comes through + // the legacy path and we credit it unconditionally + // so a mixed-version network stays live during + // transition. + return true; + } + let Some(hash) = commitment_by_peer_snapshot.get(peer) else { + // Peer is commitment_capable (sticky) but currently + // has no live commitment record on file (e.g. their + // last gossip was evicted from the LRU cache, or it + // failed verification). Withhold credit until they + // re-prove storage under a fresh commitment. 
+ return false; + }; + provers_snapshot.is_credited_holder(key, peer, hash) + }; + let mut evaluated: Vec<(XorName, KeyVerificationOutcome, HintPipeline)> = Vec::new(); { let q = queues.read().await; @@ -2180,7 +2827,13 @@ async fn run_verification_cycle(ctx: VerificationCycleContext<'_>) { let Some(entry) = q.get_pending(key) else { continue; }; - let outcome = quorum::evaluate_key_evidence(key, ev, &targets, config); + let outcome = quorum::evaluate_key_evidence_with_holder_check( + key, + ev, + &targets, + config, + holder_credit, + ); evaluated.push((*key, outcome, entry.pipeline)); } } // read lock released @@ -2556,11 +3209,104 @@ fn first_failed_key_label(confirmed_failed_keys: &[XorName]) -> String { ) } +/// Execute the side effects for a confirmed storage-commitment audit failure. +/// +/// [`plan_failed_audit`] is the pure decision INCLUDING the strike selection +/// (record-a-strike-for-`Timeout` vs leave-untouched for confirmed failures), +/// extracted so the whole glue — not just the verdict — is testable without a +/// live `P2PNode`. This function is only the resulting I/O. Timeouts are graced +/// and rollout-gated (TIMEOUT-EVICTION-DISABLED); confirmed failures penalize on +/// the first occurrence and revoke holder credit. +async fn handle_failed_audit( + challenged_peer: &PeerId, + confirmed_failed_key_count: usize, + reason: &AuditFailureReason, + p2p_node: &Arc, + sync_state: &Arc>, + recent_provers: &Arc>, + audit_timeout_strikes: &Arc>>, +) { + let action = { + let mut strikes = audit_timeout_strikes.write().await; + plan_failed_audit(reason, &mut strikes, challenged_peer) + }; + match action { + AuditFailureAction::TimeoutGrace => { + // Honest transient slowness: no penalty, no credit loss, retain the + // bootstrap claim. Only *sustained* timeouts (a peer that always + // has to refetch) survive to the threshold — the per-challenge + // window is never widened. 
+ debug!( + "Audit timeout for {challenged_peer} (under the {}-strike threshold); \ + within grace, retaining bootstrap claim, no penalty", + config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + ); + } + AuditFailureAction::TimeoutPenalize => { + // TIMEOUT-EVICTION-DISABLED: re-enable once enough nodes have + // upgraded. This PR is a breaking wire change (StorageCommitment + // gossip old nodes cannot decode), so a pre-upgrade node times out + // on every new audit and looks exactly like a non-storing peer. + // Penalising timeouts now would make upgraded nodes evict every + // not-yet-upgraded node — a network death spiral during rollout. + // Strikes are still tracked/logged so the mechanism stays + // observable; we just don't report the trust event that drives + // eviction. Confirmed storage-integrity failures (ConfirmedPenalize + // below) are unaffected — those only come from a peer that actually + // answered with bad data, never an old node. Grep + // TIMEOUT-EVICTION-DISABLED to restore the report in a small + // follow-up release. + warn!( + "Audit timeout for {challenged_peer}: reached the {}-strike threshold of \ + consecutive timeouts (eviction disabled this release — not penalizing)", + config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + ); + // p2p_node + // .report_trust_event( + // challenged_peer, + // TrustEvent::ApplicationFailure(config::AUDIT_FAILURE_TRUST_WEIGHT), + // ) + // .await; + } + AuditFailureAction::ConfirmedPenalize => { + // The caller (handle_subtree_audit_result) already logged the rich + // failure line with reason + per-category summary; avoid a redundant + // second error log here. `confirmed_failed_key_count` is retained in + // the signature for callers/tests that assert on it. + let _ = confirmed_failed_key_count; + // Peer returned a non-bootstrap response — clear the active claim + // while retaining claim history. 
+ { + let mut state = sync_state.write().await; + state.clear_active_bootstrap_claim(challenged_peer); + } + // Revoke holder credit on a CONFIRMED failure (DigestMismatch / + // KeyAbsent / Rejected / MalformedResponse): the peer no longer + // provably holds what it committed to, so it must not keep §6 + // holder credit for the proof TTL. The §5 `forget_commitment` path + // only fires on an "unknown commitment hash" reply; genuine byte + // loss surfaces here. + { + let mut provers_guard = recent_provers.write().await; + apply_audit_failure_credit_revocation(&mut provers_guard, challenged_peer, reason); + } + p2p_node + .report_trust_event( + challenged_peer, + TrustEvent::ApplicationFailure(config::AUDIT_FAILURE_TRUST_WEIGHT), + ) + .await; + } + } +} + /// Handle audit result: log findings and emit trust events. -async fn handle_audit_result( +async fn handle_subtree_audit_result( result: &AuditTickResult, p2p_node: &Arc, sync_state: &Arc>, + recent_provers: &Arc>, + audit_timeout_strikes: &Arc>>, config: &ReplicationConfig, ) { match result { @@ -2575,6 +3321,14 @@ async fn handle_audit_result( let mut state = sync_state.write().await; state.clear_active_bootstrap_claim(challenged_peer); } + // A normal response proves the slowness (if any) was transient, so + // reset the timeout-strike counter. Only *sustained* timeouts (a + // peer that must refetch on every audit) survive this reset to + // accumulate toward the penalty threshold. + { + let mut strikes = audit_timeout_strikes.write().await; + strikes.remove(challenged_peer); + } p2p_node .report_trust_event( challenged_peer, @@ -2591,6 +3345,8 @@ async fn handle_audit_result( .. } = evidence { + // Rich diagnostics (from main's audit-failure logging) + the + // first-failed-key correlation handle. 
let first_failed_key = first_failed_key_label(confirmed_failed_keys); error!( "Audit failure for {challenged_peer}: reason={reason:?}, confirmed_failed_keys={}, challenged_keys={}, absent_keys={}, digest_mismatch_keys={}, first_failed_key={first_failed_key}", @@ -2599,20 +3355,22 @@ async fn handle_audit_result( summary.absent_keys, summary.digest_mismatch_keys, ); - if audit_failure_clears_bootstrap_claim(reason) { - // Peer returned a non-bootstrap response — clear the active - // claim while retaining claim history. - let mut state = sync_state.write().await; - state.clear_active_bootstrap_claim(challenged_peer); - } else { - debug!("Audit timeout for {challenged_peer}; retaining active bootstrap claim"); - } - p2p_node - .report_trust_event( - challenged_peer, - TrustEvent::ApplicationFailure(config::AUDIT_FAILURE_TRUST_WEIGHT), - ) - .await; + // Route the side effects through the strike-grace path: timeouts + // are graced (and rollout-gated by TIMEOUT-EVICTION-DISABLED), + // deterministic failures penalize on the first occurrence and + // revoke holder credit. Do NOT report ApplicationFailure inline + // here — that would evict honest not-yet-upgraded peers on a + // single timeout during the breaking rollout. + handle_failed_audit( + challenged_peer, + confirmed_failed_keys.len(), + reason, + p2p_node, + sync_state, + recent_provers, + audit_timeout_strikes, + ) + .await; } } AuditTickResult::BootstrapClaim { peer } => { @@ -2660,23 +3418,1457 @@ async fn handle_audit_result( } } +/// Whether a confirmed audit failure with this reason clears the peer's active +/// bootstrap claim. A `Timeout` does not (the peer may still be legitimately +/// bootstrapping); every confirmed storage-integrity reason does. Used by the +/// responsible-chunk audit's [`handle_audit_result`]; the storage-commitment +/// audit's [`handle_subtree_audit_result`] special-cases `Timeout` directly. 
fn audit_failure_clears_bootstrap_claim(reason: &AuditFailureReason) -> bool { !matches!(reason, AuditFailureReason::Timeout) } -// `admit_bootstrap_hints` was consolidated into `admit_and_queue_hints`. - -#[cfg(test)] -#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] -mod tests { - use super::{audit_failure_clears_bootstrap_claim, first_failed_key_label}; - use crate::replication::types::AuditFailureReason; - - #[test] - fn audit_timeout_preserves_active_bootstrap_claim() { - assert!(!audit_failure_clears_bootstrap_claim( - &AuditFailureReason::Timeout - )); +/// Handle the result of a responsible-chunk audit tick (audit #2): emit trust +/// events and manage bootstrap-claim state. This is the original (pre-ADR-0002) +/// audit-result handler, restored alongside the gossip-triggered +/// storage-commitment audit's [`handle_subtree_audit_result`]. +async fn handle_audit_result( + result: &AuditTickResult, + p2p_node: &Arc, + sync_state: &Arc>, + config: &ReplicationConfig, +) { + match result { + AuditTickResult::Passed { + challenged_peer, + keys_checked, + } => { + debug!("Audit passed for {challenged_peer} ({keys_checked} keys)"); + // Peer responded normally — clear the active bootstrap claim while + // retaining history so a later claim is treated as repeated abuse. + { + let mut state = sync_state.write().await; + state.clear_active_bootstrap_claim(challenged_peer); + } + p2p_node + .report_trust_event( + challenged_peer, + TrustEvent::ApplicationSuccess(REPLICATION_TRUST_WEIGHT), + ) + .await; + } + AuditTickResult::Failed { evidence } => { + if let FailureEvidence::AuditFailure { + challenged_peer, + confirmed_failed_keys, + summary, + reason, + .. + } = evidence + { + // Rich audit-failure diagnostics (preserved from main's + // audit-failure logging): per-category counts + first-failed-key + // correlation handle. 
+ let first_failed_key = first_failed_key_label(confirmed_failed_keys); + error!( + "Audit failure for {challenged_peer}: reason={reason:?}, confirmed_failed_keys={}, challenged_keys={}, absent_keys={}, digest_mismatch_keys={}, first_failed_key={first_failed_key}", + confirmed_failed_keys.len(), + summary.challenged_keys, + summary.absent_keys, + summary.digest_mismatch_keys, + ); + if audit_failure_clears_bootstrap_claim(reason) { + // Peer returned a non-bootstrap response — clear the active + // claim while retaining claim history. + let mut state = sync_state.write().await; + state.clear_active_bootstrap_claim(challenged_peer); + } else { + debug!("Audit timeout for {challenged_peer}; retaining active bootstrap claim"); + } + p2p_node + .report_trust_event( + challenged_peer, + TrustEvent::ApplicationFailure(config::AUDIT_FAILURE_TRUST_WEIGHT), + ) + .await; + } + } + AuditTickResult::BootstrapClaim { peer } => { + // Gap 6: BootstrapClaimAbuse grace period in audit path. + // Separate state mutation from network I/O to avoid holding the + // write lock across report_trust_event. + let should_report = { + let now = Instant::now(); + let mut state = sync_state.write().await; + match state.observe_bootstrap_claim(*peer, now, config.bootstrap_claim_grace_period) + { + BootstrapClaimObservation::WithinGrace { .. 
} => { + debug!("Audit: peer {peer} claims bootstrapping (within grace period)"); + false + } + BootstrapClaimObservation::PastGrace { first_seen } => { + warn!( + "Audit: peer {peer} claiming bootstrap past grace period \ + ({:?} > {:?}), reporting abuse", + now.duration_since(first_seen), + config.bootstrap_claim_grace_period, + ); + true + } + BootstrapClaimObservation::Repeated { first_seen } => { + warn!( + "Audit: peer {peer} repeated bootstrap claim after previously \ + stopping; first claim was {:?} ago, reporting abuse", + now.duration_since(first_seen), + ); + true + } + } + }; + if should_report { + p2p_node + .report_trust_event( + peer, + TrustEvent::ApplicationFailure(REPLICATION_TRUST_WEIGHT), + ) + .await; + } + } + AuditTickResult::Idle | AuditTickResult::InsufficientKeys => {} + } +} + +/// What the audit-failure handler should do for a given failure, given the +/// peer's post-increment timeout-strike count. Pure (no I/O) so the whole +/// decision can be exercised end-to-end without a live `P2PNode`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AuditFailureAction { + /// Timeout under the strike threshold: no trust penalty, no credit + /// revocation, retain the bootstrap claim (honest transient slowness). + TimeoutGrace, + /// Timeout at/over the threshold: report `ApplicationFailure`. Bootstrap + /// claim retained; holder credit NOT revoked (the peer never admitted byte + /// loss). The non-storing-peer case. + TimeoutPenalize, + /// Confirmed storage-integrity failure: penalize immediately, clear the + /// active bootstrap claim, and revoke holder credit. + ConfirmedPenalize, +} + +/// Upper bound on a peer's consecutive-timeout strike count. 
Must exceed the +/// largest reachable adaptive threshold (base + `MAX_ADAPTIVE_TIMEOUT_GRACE`) so +/// a genuinely non-responsive peer's count can always catch up to and cross an +/// inflated threshold — otherwise capping at the base would make timeout +/// penalties unreachable once the adaptive threshold rose. +const AUDIT_TIMEOUT_STRIKE_MAX: u32 = 64; + +/// Maximum extra grace the adaptive mechanism may add on top of the base +/// threshold. Bounds how far a (possibly stale) set of timing-out peers can +/// widen the window, so a small persistent failing cohort cannot push the +/// threshold arbitrarily high and shield a bad node indefinitely. +const MAX_ADAPTIVE_TIMEOUT_GRACE: u32 = 2 * config::AUDIT_TIMEOUT_STRIKE_THRESHOLD; + +/// Record an audit timeout for `peer` and return its new consecutive-timeout +/// strike count, saturating at [`AUDIT_TIMEOUT_STRIKE_MAX`] (well above any +/// reachable adaptive threshold). A successful audit removes the peer's entry +/// (the `Passed` arm of [`handle_subtree_audit_result`]), so only *consecutive* +/// timeouts accumulate here. +fn record_audit_timeout_strike(strikes: &mut HashMap, peer: &PeerId) -> u32 { + let count = strikes.entry(*peer).or_insert(0); + *count = count.saturating_add(1).min(AUDIT_TIMEOUT_STRIKE_MAX); + *count +} + +/// The adaptive timeout-strike threshold for judging `peer` (ADR-0002 "Network +/// Resilience"): `min(median of the OTHER timing-out peers' counts, +/// MAX_ADAPTIVE_TIMEOUT_GRACE) + base threshold`. +/// +/// In a healthy network almost no peer carries timeout strikes, so the median +/// is 0 and the threshold is the base [`config::AUDIT_TIMEOUT_STRIKE_THRESHOLD`]. +/// During genuine disruption many *honest* peers time out together, lifting the +/// median and widening the grace so the audit system does not pile onto a +/// struggling network — but the widening is capped at `MAX_ADAPTIVE_TIMEOUT_GRACE` +/// so a stale failing cohort cannot inflate it without bound. 
+/// +/// `peer` is EXCLUDED from the median so a lone timing-out peer cannot raise its +/// own grace bar. Combined with the map being fed ONLY by timeouts (deterministic +/// failures never touch it), this closes self-inflation and bounds +/// attacker-inflation of the grace window. +fn adaptive_timeout_threshold(strikes: &HashMap, peer: &PeerId) -> u32 { + let grace = median_timeout_strikes_excluding(strikes, peer).min(MAX_ADAPTIVE_TIMEOUT_GRACE); + grace.saturating_add(config::AUDIT_TIMEOUT_STRIKE_THRESHOLD) +} + +/// Lower median of the current per-peer consecutive-timeout counts, excluding +/// `peer`. No other peers → 0. +fn median_timeout_strikes_excluding(strikes: &HashMap, peer: &PeerId) -> u32 { + let mut counts: Vec = strikes + .iter() + .filter(|(p, _)| *p != peer) + .map(|(_, c)| *c) + .collect(); + if counts.is_empty() { + return 0; + } + counts.sort_unstable(); + // Lower median: for even-sized inputs take the lower of the two middle + // values ((len-1)/2), so the grace is conservative rather than inflated. + counts.get((counts.len() - 1) / 2).copied().unwrap_or(0) +} + +/// Whether a peer's consecutive-timeout strike count reaches the (adaptive) +/// threshold for emitting an `ApplicationFailure` trust event. +fn timeout_strike_reaches_threshold(strikes: u32, threshold: u32) -> bool { + strikes >= threshold +} + +/// Decide what to do about a confirmed audit failure. `timeout_strikes_after` +/// is the peer's strike count after recording this event and `timeout_threshold` +/// the adaptive threshold to compare against (both only meaningful when +/// `reason == Timeout`). Pure, so the integration-level decision can be asserted +/// in tests with no networking. 
+fn decide_audit_failure_action( + reason: &AuditFailureReason, + timeout_strikes_after: u32, + timeout_threshold: u32, +) -> AuditFailureAction { + if matches!(reason, AuditFailureReason::Timeout) { + if timeout_strike_reaches_threshold(timeout_strikes_after, timeout_threshold) { + AuditFailureAction::TimeoutPenalize + } else { + AuditFailureAction::TimeoutGrace + } + } else { + AuditFailureAction::ConfirmedPenalize + } +} + +/// Plan the response to a confirmed audit failure, performing the +/// strike-selection glue in-process: a `Timeout` records a strike against +/// `peer` (so consecutive timeouts accumulate) and is judged against the +/// adaptive threshold; every other reason is a confirmed failure that does NOT +/// touch the strike map. The caller owns the lock and performs the resulting I/O. +fn plan_failed_audit( + reason: &AuditFailureReason, + strikes: &mut HashMap, + peer: &PeerId, +) -> AuditFailureAction { + // Snapshot the adaptive threshold from the *other* peers' counts (excluding + // this peer), so a single peer's own timeouts cannot raise its own grace bar. + let threshold = adaptive_timeout_threshold(strikes, peer); + let strikes_after = if matches!(reason, AuditFailureReason::Timeout) { + record_audit_timeout_strike(strikes, peer) + } else { + 0 + }; + decide_audit_failure_action(reason, strikes_after, threshold) +} + +/// Whether a confirmed audit failure with this reason should revoke the +/// peer's `recent_provers` holder credit immediately (v12 §6). +/// +/// `true` for any reason where the peer actually answered (or admitted +/// it cannot): `DigestMismatch`, `KeyAbsent`, `Rejected` ("missing +/// bytes for committed key"), `MalformedResponse` — these prove the +/// peer no longer holds what it committed to, so it must not keep +/// holder credit for the proof TTL. `false` for `Timeout`: a single +/// dropped packet must not strip an honest peer; the 40-min TTL is the +/// deliberate liveness cushion there. 
+fn audit_failure_revokes_holder_credit(reason: &AuditFailureReason) -> bool { + !matches!(reason, AuditFailureReason::Timeout) +} + +/// Apply the holder-credit revocation decision for a confirmed audit +/// failure. Pure over `RecentProvers` so the handler wiring is unit- +/// testable without a live `P2PNode`: the production `Failed` arm of +/// `handle_subtree_audit_result` calls exactly this. +fn apply_audit_failure_credit_revocation( + provers: &mut RecentProvers, + challenged_peer: &PeerId, + reason: &AuditFailureReason, +) { + if audit_failure_revokes_holder_credit(reason) { + provers.forget_peer(challenged_peer); + } +} + +// `admit_bootstrap_hints` was consolidated into `admit_and_queue_hints`. + +// --------------------------------------------------------------------------- +// Storage-bound audit (ADR-0002) — gossip trigger + auditor-side ingestion +// --------------------------------------------------------------------------- + +/// State the gossip-audit trigger needs to spawn an audit. Bundled so the +/// message handler passes one value instead of a long argument list; all +/// fields are cheap `Arc` clones. +#[derive(Clone)] +struct GossipAuditTrigger { + p2p_node: Arc, + config: Arc, + recent_provers: Arc>, + sync_state: Arc>, + audit_timeout_strikes: Arc>>, + cooldown: Arc>>, +} + +/// What a gossip ingest yields for the audit trigger: the commitment hash to +/// pin and the `key_count` needed to size the response deadline from the actual +/// `ceil(sqrt(N))` subtree (ADR-0002). Returned on every VALID gossip (changed +/// or not) so a stable-keyset node stays auditable — not just on its first +/// commitment. +#[derive(Debug, Clone, Copy)] +struct AuditTarget { + pin_hash: [u8; 32], + key_count: u32, +} + +/// Per-peer audit cooldown check-and-stamp (ADR-0002 "occasional surprise +/// exams, keeps load low"). 
Returns `true` if `peer` may be audited now (and +/// stamps `now`), `false` if it was audited within +/// `AUDIT_ON_GOSSIP_COOLDOWN_SECS`. Bounds the map under a flood of distinct +/// peers. Pure over the passed map so the flood/cooldown behaviour is testable +/// without a live node: a burst of gossips from one peer yields at most one +/// `true` per cooldown window. +fn cooldown_allows_audit(map: &mut HashMap, peer: &PeerId, now: Instant) -> bool { + let cooldown = Duration::from_secs(config::AUDIT_ON_GOSSIP_COOLDOWN_SECS); + let known = match map.get(peer) { + Some(&last) => { + if now.saturating_duration_since(last) < cooldown { + return false; + } + true + } + None => false, + }; + // Bound the map under churn like its siblings (drop the oldest stamp) before + // admitting a brand-new peer. + if !known && map.len() >= MAX_LAST_COMMITMENT_BY_PEER { + if let Some(victim) = map.iter().min_by_key(|(_, &ts)| ts).map(|(p, _)| *p) { + map.remove(&victim); + } + } + map.insert(*peer, now); + true +} + +/// The gossip-audit launch decision in ONE place so the ordering is shared +/// between production and its test (ADR-0002 "occasional surprise exams"). +/// +/// Order matters and is the security-relevant property: the per-peer cooldown is +/// checked-and-stamped FIRST, THEN the probability lottery (`lottery_wins`) is +/// applied. If the lottery were sampled first, a gossip flood would re-roll it on +/// every message until one won, multiplying audits. Because the cooldown is +/// stamped before the lottery is consulted, a LOSING ticket still consumes the +/// window — so each peer gets at most one audit lottery per cooldown window +/// regardless of how often it gossips. Production calls this with +/// `lottery_wins = gen_bool(AUDIT_ON_GOSSIP_PROBABILITY)`; the test calls it with +/// a deterministic `lottery_wins`, so a reorder regression here fails the test. 
+fn audit_launch_decision( + map: &mut HashMap, + peer: &PeerId, + now: Instant, + lottery_wins: bool, +) -> bool { + // Gate 1: cooldown check-and-stamp (consumes the window even on a loss). + if !cooldown_allows_audit(map, peer, now) { + return false; + } + // Gate 2: the probability lottery. + lottery_wins +} + +/// On a peer's *changed* gossiped commitment, maybe launch a subtree audit +/// (ADR-0002): fire with probability `AUDIT_ON_GOSSIP_PROBABILITY`, subject to a +/// per-peer cooldown, pinned to the just-ingested root. Detached so gossip +/// handling is never blocked on a network round-trip. +async fn maybe_trigger_gossip_audit( + trigger: &GossipAuditTrigger, + peer: &PeerId, + target: AuditTarget, +) { + // The launch decision (cooldown-then-lottery ordering) lives in the pure + // `audit_launch_decision` so the ordering is shared with its test. Sample + // the lottery here, then let the helper apply it AFTER the cooldown stamp. + let now = Instant::now(); + let lottery_wins = rand::thread_rng().gen_bool(config::AUDIT_ON_GOSSIP_PROBABILITY); + { + let mut map = trigger.cooldown.write().await; + if !audit_launch_decision(&mut map, peer, now, lottery_wins) { + return; + } + } + + let trigger = trigger.clone(); + let peer = *peer; + tokio::spawn(async move { + let credit = storage_commitment_audit::AuditCredit { + recent_provers: &trigger.recent_provers, + }; + let result = storage_commitment_audit::run_subtree_audit( + &trigger.p2p_node, + &trigger.config, + &peer, + target.pin_hash, + target.key_count, + Some(&credit), + ) + .await; + handle_subtree_audit_result( + &result, + &trigger.p2p_node, + &trigger.sync_state, + &trigger.recent_provers, + &trigger.audit_timeout_strikes, + &trigger.config, + ) + .await; + }); +} + +/// Atomic check-and-stamp of the per-peer commitment sig-verify rate limit. 
+/// +/// Returns `true` if a signature verify is allowed now (and stamps the attempt +/// time), `false` if the peer is within [`COMMITMENT_SIG_VERIFY_MIN_INTERVAL`] +/// of its last attempt. Holds one write lock across the decision so two +/// concurrent ingests from the same peer cannot both pass. Stamps BEFORE the +/// caller's expensive verify so a slow/failed verify still rate-limits the next +/// message. Bounds the map under a flood of distinct peer ids. +async fn sig_verify_rate_limit_ok( + sig_verify_attempts: &Arc>>, + source: &PeerId, + now: Instant, +) -> bool { + let mut attempts = sig_verify_attempts.write().await; + if let Some(&last) = attempts.get(source) { + if now.saturating_duration_since(last) < COMMITMENT_SIG_VERIFY_MIN_INTERVAL { + return false; + } + } + if attempts.len() >= MAX_LAST_COMMITMENT_BY_PEER && !attempts.contains_key(source) { + if let Some(victim) = attempts.iter().min_by_key(|(_, &ts)| ts).map(|(p, _)| *p) { + attempts.remove(&victim); + } + } + attempts.insert(*source, now); + true +} + +/// Verify + store an inbound commitment from a gossip peer. +/// +/// Called from the inbound `NeighborSyncRequest`/`Response` handlers and +/// the bootstrap-sync loop. Drops the commitment unless all five gates +/// pass: +/// 1. `source` is in our DHT routing table (sybil/churn cap). +/// 2. `commitment.sender_peer_id == source.as_bytes()` (peer-id +/// binding to the authenticated transport peer). +/// 3. `BLAKE3(commitment.sender_public_key) == commitment.sender_peer_id` +/// (the embedded pubkey actually belongs to the claimed identity — +/// saorsa-core derives `PeerId = BLAKE3(pubkey)`). +/// 4. `verify_commitment_signature(commitment)` succeeds against the +/// embedded public key. The signed payload binds the pubkey, so an +/// adversary cannot swap the key while keeping the body. +/// 5. The cache has room or this is an update for an existing entry +/// (sybil cap, `MAX_LAST_COMMITMENT_BY_PEER`). 
+/// +/// On all-pass, the commitment is stored as the auditor's per-peer +/// "last known commitment" for use as `expected_commitment_hash` in +/// future audits. +/// +/// Failures (no commitment / mismatched peer id / bad signature) are +/// silent drops — gossip is best-effort and a malformed commitment from +/// one peer should not affect anything else. +/// +/// Returns `Some(AuditTarget)` whenever a VALID commitment was stored (whether +/// or not its root changed), so the caller can run a probabilistic, +/// cooldown-gated subtree audit. Returning on *every* valid gossip — not only +/// changed ones — is deliberate (ADR-0002): a node with a stable key set keeps +/// being auditable, so it cannot pass one audit and then delete data while +/// re-gossiping the same root forever. The cooldown + probability bound the +/// audit frequency. Returns `None` only if the commitment was dropped (failed a +/// gate) or there is nothing to pin. +/// +/// Handle a capable peer gossiping `None` (a commitment downgrade). +/// +/// A capable peer that previously gossiped a commitment but now gossips `None` +/// is trying to drop off the audit path. Within the answerability window we keep +/// the cached commitment pinned AND return it as an audit target so this gossip +/// still schedules a subtree audit against the peer's last known commitment — if +/// it genuinely dropped the data, the audit fails (there is no periodic tick, so +/// the trigger MUST fire here or the downgrade is never re-challenged). +/// +/// But this only holds within the SAME `GOSSIP_ANSWERABILITY_TTL` the responder +/// honours for its own retired commitment: once that elapses since we last +/// received the peer's commitment, an honest peer has legitimately retired that +/// root (its responder side `retire_current`s and lets it age out) and can no +/// longer answer a pin on it. 
Auditing it past the TTL would manufacture a false +/// failure, so we then forget the cached commitment (keeping the sticky +/// `commitment_capable` bit) and stop pinning it. +async fn handle_commitment_downgrade( + source: &PeerId, + last_commitment_by_peer: &Arc>>, +) -> Option { + let now = Instant::now(); + let cached = { + let map = last_commitment_by_peer.read().await; + map.get(source).and_then(|rec| { + if !rec.commitment_capable { + return None; + } + let last = rec.last_commitment.as_ref()?; + let pin = commitment_hash(last)?; + let fresh = now.saturating_duration_since(rec.received_at) + < crate::replication::commitment_state::GOSSIP_ANSWERABILITY_TTL; + Some((pin, last.key_count, fresh)) + }) + }; + match cached { + Some((pin, key_count, true)) => { + warn!( + "ingest_peer_commitment: commitment-capable peer {source} sent None \ + (downgrade attempt); auditing against its last cached commitment" + ); + Some(AuditTarget { + pin_hash: pin, + key_count, + }) + } + Some((_, _, false)) => { + // Cached commitment has aged past the answerability window — forget + // it so we stop pinning a root the peer is no longer obliged to + // answer. Keep `commitment_capable` (sticky). Re-check freshness + // UNDER the write lock (compare-and-clear): a concurrent valid gossip + // from this peer may have refreshed `received_at` in the gap between + // our read and write locks; if so, leave its fresh commitment intact. 
+ if let Some(rec) = last_commitment_by_peer.write().await.get_mut(source) { + let still_stale = now.saturating_duration_since(rec.received_at) + >= crate::replication::commitment_state::GOSSIP_ANSWERABILITY_TTL; + if still_stale { + rec.last_commitment = None; + debug!( + "ingest_peer_commitment: capable peer {source} sent None and its cached \ + commitment aged past the answerability TTL; forgetting it" + ); + } + } + None + } + None => None, + } +} + +async fn ingest_peer_commitment( + source: &PeerId, + commitment: Option<&StorageCommitment>, + p2p_node: &Arc, + last_commitment_by_peer: &Arc>>, + ever_capable_peers: &Arc>>, + sig_verify_attempts: &Arc>>, +) -> Option { + let Some(c) = commitment else { + return handle_commitment_downgrade(source, last_commitment_by_peer).await; + }; + // RT-membership gate: only accept commitments from peers in our + // routing table. Off-RT senders (sybils, drive-by relays) cannot + // populate the cache, which closes the hole where a flood of + // off-RT identities could fill the cap and evict honest + // peers. The neighbor-sync request handler applies the same gate + // before admitting inbound replication hints (see neighbor_sync.rs + // `sender_in_rt`); we mirror that policy here for the commitment + // piggyback. + if !p2p_node.dht_manager().is_in_routing_table(source).await { + debug!("ingest_peer_commitment: source {source} not in routing table (dropped)"); + return None; + } + // Peer-id binding: the commitment's claimed sender must match the + // authenticated transport peer (`source`). Defeats relay/replay + // and also pins which embedded public key we are about to verify + // against — the verify itself trusts the embedded key, so the + // peer-id binding is the link to a real identity. 
+ if &c.sender_peer_id != source.as_bytes() { + warn!( + "ingest_peer_commitment: sender_peer_id mismatch from {source} \ + (dropped, possible relay attempt)" + ); + return None; + } + // Peer-id to embedded-pubkey binding: saorsa-core derives PeerId as + // BLAKE3(pubkey_bytes). Without this check, a responder could sign + // with a throwaway key they own and lie about which identity it + // belongs to (the embedded-key signature would verify trivially). + let derived_peer_id = *blake3::hash(&c.sender_public_key).as_bytes(); + if derived_peer_id != c.sender_peer_id { + warn!( + "ingest_peer_commitment: embedded pubkey does not hash to claimed peer_id for \ + {source} (dropped, throwaway-key attack)" + ); + return None; + } + // §2 step 3 + §11 DoS: rate-limit per-peer to at most one ML-DSA + // signature verify per `COMMITMENT_SIG_VERIFY_MIN_INTERVAL`. A + // sybil/RT-membership-bypassing peer that flooded valid-looking + // gossip would otherwise burn CPU on every message. The rate + // limit is checked AFTER cheap structural gates (RT, peer-id + // binding, pubkey-binding) and BEFORE the expensive sig verify. + // + // Tracked in `sig_verify_attempts` (separate from + // last_commitment_by_peer) so EVERY attempt — successful or not — + // bumps the rate-limit clock. Reading only from PeerCommitmentRecord + // would skip the cap for peers we've never successfully verified, + // letting a flood of invalid-but-structurally-plausible gossips + // burn CPU. + let now = Instant::now(); + if !sig_verify_rate_limit_ok(sig_verify_attempts, source, now).await { + debug!( + "ingest_peer_commitment: rate-limited sig verify from {source} \ + (< {COMMITMENT_SIG_VERIFY_MIN_INTERVAL:?} since last attempt); dropped" + ); + return None; + } + // Signature verify, using the public key embedded in the commitment + // itself. 
The pubkey is bound by the signature payload (see + // commitment_signed_payload) so an adversary cannot keep the body + // and swap the key to one they hold the secret for. + if !crate::replication::commitment::verify_commitment_signature(c) { + warn!( + "ingest_peer_commitment: signature did not verify under embedded key for {source} \ + (dropped, forged commitment)" + ); + return None; + } + // The new commitment's hash, used to store and to pin for the audit target. + let new_hash = commitment_hash(c); + let mut map = last_commitment_by_peer.write().await; + // Sybil/churn cap: if we're at the hard cap AND this is a new peer, + // evict an arbitrary existing entry to make room. Updates for peers + // already in the map are always accepted (they replace, not grow). + if map.len() >= MAX_LAST_COMMITMENT_BY_PEER && !map.contains_key(source) { + // Drop one arbitrary entry. HashMap iter order is random which + // is fine — over time PeerRemoved cleanup keeps the working set + // anchored on the real RT membership; this cap only fires under + // active flooding attempts. + if let Some(victim) = map.keys().next().copied() { + map.remove(&victim); + warn!( + "ingest_peer_commitment: cache full ({MAX_LAST_COMMITMENT_BY_PEER}); \ + evicted {victim} to admit {source}" + ); + } + } + // Preserve sticky commitment_capable across updates — once true, + // always true. New entries start with capable = true (we just + // verified a valid commitment from this peer). + map.entry(*source) + .and_modify(|r| { + r.last_commitment = Some(c.clone()); + r.received_at = now; + r.last_sig_verify_at = now; + r.commitment_capable = true; // sticky-redundant but explicit + }) + .or_insert_with(|| PeerCommitmentRecord::from_verified(c.clone(), now)); + drop(map); + // Record the sticky "ever v12-capable" bit in a set independent of + // `last_commitment_by_peer` (whose entries can be evicted by + // `PeerRemoved` and the sybil cap). 
This is what the §3 audit + // shield and the §6 holder-eligibility closure consult to decide + // whether the peer is expected to speak v12. + // + // Capped at `MAX_EVER_CAPABLE_PEERS` to bound memory under + // identity-rotation attacks: once full, new entries are refused. + // Refusal degrades over-cap peers to the behaviour before this set + // existed (treated as legacy on rejoin), which is not a security + // regression and preserves the historic set stable. + { + let mut set = ever_capable_peers.write().await; + if set.contains(source) || set.len() < MAX_EVER_CAPABLE_PEERS { + set.insert(*source); + } else { + warn!( + "ingest_peer_commitment: ever_capable_peers at cap \ + ({MAX_EVER_CAPABLE_PEERS}); refusing to record {source} as sticky-capable" + ); + } + } + // Return an audit target for EVERY valid stored commitment (changed or + // not), so the caller's cooldown+probability-gated trigger keeps a + // stable-keyset peer auditable over time (ADR-0002). Only a serialization + // failure (new_hash == None, unreachable for a real commitment) yields None. + new_hash.map(|pin_hash| AuditTarget { + pin_hash, + key_count: c.key_count, + }) +} + +// --------------------------------------------------------------------------- +// Storage-bound audit (v12) — responder commitment rotation +// --------------------------------------------------------------------------- + +/// Read the current LMDB key set, build + sign a fresh +/// `StorageCommitment`, and rotate it into `state` as the new `current`. +/// The prior `current` is demoted to `previous`; the prior `previous` is +/// dropped (per `ResponderCommitmentState::rotate`). +/// +/// For content-addressed chunks (Autonomi's chunk store), `address == +/// BLAKE3(content)`, so `bytes_hash := key` and we don't have to +/// re-read each chunk's bytes to compute the leaf hash. +/// +/// Skips (returns `Ok(())`) if the key set is empty — no commitment to +/// rotate. 
The auditor side handles "no commitment for this peer" by +/// falling back to the legacy plain-digest audit path. +async fn rebuild_and_rotate_commitment( + storage: &Arc, + identity: &Arc, + state: &Arc, + p2p: &Arc, + config: &Arc, +) -> Result<()> { + use saorsa_pqc::api::sig::{MlDsaSecretKey, MlDsaVariant}; + + let stored_keys = storage + .all_keys() + .await + .map_err(|e| Error::Storage(format!("commitment build: read keys: {e}")))?; + + // Commit only to keys we are still RESPONSIBLE for ("want-to-hold"), not + // everything currently on disk ("hold"). This is the half of the retention + // contract that lets out-of-range chunks age out: a key that has left our + // close group is excluded from the NEXT commitment, so within at most + // RETAINED_GOSSIPED_COMMITMENTS gossip rotations it falls out of the + // last-2-gossiped window, `ResponderCommitmentState::is_held` goes false, + // and the pruner (which until then vetoes its deletion) reclaims it. Without + // this filter the pruner's reprieve would keep re-committing stale keys + // forever (the rebuild reads all_keys, so a retained-on-disk key would be + // re-committed and re-gossiped every rotation — a permanent pin). + let storage_empty = stored_keys.is_empty(); + let self_id = *p2p.peer_id(); + let mut keys = Vec::with_capacity(stored_keys.len()); + for k in stored_keys { + if admission::is_responsible(&self_id, &k, p2p, config.close_group_size).await { + keys.push(k); + } + } + + if keys.is_empty() { + if storage_empty { + // Storage is genuinely empty — there is nothing to answer for, so + // drop the previously advertised commitment immediately. Keeping it + // would leave remote auditors pinning a hash we can never satisfy + // again (the bytes are gone). + if state.retained_slot_count() > 0 { + debug!("Commitment rotation: storage empty, clearing retained slots"); + state.clear_all(); + } + return Ok(()); + } + // Bytes are still on disk but no key is currently in range.
We must NOT + // clear retention here: a peer may still be pinning a root we gossiped + // moments ago and could demand its bytes in a round-2 challenge, which + // we can still answer (the bytes are present). But we must STOP + // advertising the stale commitment: retire it so `current()` returns + // `None` and the gossip-emit sites stop re-emitting and re-stamping it. + // The retired slot then ages out by its gossip-answerability TTL while + // remaining answerable for in-flight pins until then. Once it ages out, + // `is_held` flips false and the pruner reclaims the now-uncommitted, + // out-of-range chunks. (Calling `age_out` alone would leave `current()` + // pointing at the stale root, which the gossip loop would keep + // re-stamping — pinning its keys forever.) + debug!( + "Commitment rotation: no responsible keys to commit to; retiring current commitment \ + (stays answerable until its gossip TTL lapses, bytes still on disk)" + ); + state.retire_current(); + return Ok(()); + } + + // Cap to MAX_COMMITMENT_KEY_COUNT for v12 (responder must not commit + // to more than the protocol limit; auditor would reject the + // commitment otherwise). + // Note: the `if` below only logs the overflow; the truncation itself is + // the `.take(cap)` applied when `entries` is built further down. + let cap = commitment::MAX_COMMITMENT_KEY_COUNT as usize; + if keys.len() > cap { + warn!( + "Commitment rotation: key set ({}) exceeds MAX_COMMITMENT_KEY_COUNT ({}); \ + truncating — investigate as this likely means a misconfiguration", + keys.len(), + cap + ); + } + + // INVARIANT: this module is only used with CONTENT-ADDRESSED chunks, + // where `key == BLAKE3(content)`, so `bytes_hash := key` and we skip a + // full chunk re-read per rotation. + // + // Consequence to be precise about: because the leaf is `(key, key)`, + // the Merkle root commits to the SET OF KEYS, not to the bytes. The + // commitment therefore binds "which keys I claim to hold"; it does NOT + // by itself prove byte possession.
Byte possession is enforced by the + // audit-verify path, which recomputes `bytes_hash == BLAKE3(local_bytes)` + // and the per-key digest against the AUDITOR'S OWN local copy of the + // bytes — so a responder that holds the key list but dropped the bytes + // still fails (`missing bytes for committed key` / digest mismatch). + // This is sound ONLY while keys are content addresses. If this module + // is ever reused for non-content-addressed records (`bytes_hash != key`), + // the `(k, k)` shortcut would let a byte-less node forge a valid root and + // MUST be replaced with `(key, BLAKE3(bytes))` computed from real bytes. + // NOTE(review): `.take(cap)` keeps the first `cap` keys in the order the + // responsibility filter produced them (i.e. `all_keys()` order) — confirm + // that order is stable across rotations, otherwise an over-cap node could + // commit to a different subset, and so a different root, each time. + let entries: Vec<_> = keys.into_iter().take(cap).map(|k| (k, k)).collect(); + + // No-op-rotation guard: compute just the Merkle root from `entries` + // and compare against the currently-advertised commitment's root. + // If they match, the key set is unchanged and a new rotation would + // only swap a randomized ML-DSA signature for a fresh one — same + // content, different commitment_hash. That invalidates every + // outstanding `recent_provers` credit on this node across the + // close group with no security benefit, breaking steady-state + // quorum liveness on large nodes that can't re-audit every key + // every rotation interval. Skip the rotation entirely when the + // tree is unchanged.
+ let candidate_tree = + commitment::MerkleTree::build(entries.iter().map(|(k, bh)| (*k, *bh)).collect::>()) + .map_err(|e| Error::Crypto(format!("commitment tree build: {e}")))?; + let candidate_root = candidate_tree.root(); + if let Some(current) = state.current() { + if current.commitment().root == candidate_root { + debug!( + "Commitment rotation: key set unchanged (root={}); skipping no-op re-sign", + hex::encode(candidate_root) + ); + // Even though we skip re-signing (to avoid invalidating holder + // credit), retention must still advance on the wall clock: a + // previously-gossiped commitment that holds a now-out-of-range key + // must be able to age out of the answerability window even when the + // committed key set is frozen here for many rotations. Without this, + // the no-op guard would pin a stale slot — and its key — forever. + state.age_out(); + return Ok(()); + } + } + + let sk_bytes = identity.secret_key_bytes().to_vec(); + let sk = MlDsaSecretKey::from_bytes(MlDsaVariant::MlDsa65, &sk_bytes) + .map_err(|e| Error::Crypto(format!("commitment build: load sk: {e}")))?; + let pk_bytes = identity.public_key().as_bytes().to_vec(); + let peer_id_bytes = *p2p.peer_id().as_bytes(); + + let built = commitment_state::BuiltCommitment::build(entries, &peer_id_bytes, &sk, &pk_bytes) + .map_err(|e| Error::Crypto(format!("commitment build: {e}")))?; + + let hash = hex::encode(built.hash()); + let key_count = built.commitment().key_count; + state.rotate(built); + info!("Storage commitment rotated: hash={hash} key_count={key_count}"); + Ok(()) +} + +#[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] +mod tests { + use super::{ + adaptive_timeout_threshold, apply_audit_failure_credit_revocation, + audit_failure_clears_bootstrap_claim, audit_failure_revokes_holder_credit, + audit_launch_decision, config, cooldown_allows_audit, decide_audit_failure_action, + first_failed_key_label, median_timeout_strikes_excluding, plan_failed_audit,
record_audit_timeout_strike, timeout_strike_reaches_threshold, AuditFailureAction, + AUDIT_TIMEOUT_STRIKE_MAX, + }; + use crate::replication::recent_provers::RecentProvers; + use crate::replication::types::AuditFailureReason; + use saorsa_core::identity::PeerId; + use std::collections::HashMap; + use std::time::Duration; + use std::time::Instant; + + fn test_peer(b: u8) -> PeerId { + let mut bytes = [0u8; 32]; + bytes[0] = b; + PeerId::from_bytes(bytes) + } + + fn test_key(b: u8) -> crate::ant_protocol::XorName { + let mut k = [0u8; 32]; + k[0] = b; + k + } + + #[test] + fn audit_timeout_preserves_active_bootstrap_claim() { + assert!(!audit_failure_clears_bootstrap_claim( + &AuditFailureReason::Timeout + )); + } + + // Builds the same `PeerId` as `test_peer` (first byte `b`, remaining bytes + // zero); the strike and cooldown tests use this name. + fn strike_peer(b: u8) -> PeerId { + let mut bytes = [0u8; 32]; + bytes[0] = b; + PeerId::from_bytes(bytes) + } + + // HELPER-LEVEL: counter arithmetic + threshold predicate. The reset is + // simulated by an in-test `strikes.remove`; the real reset path (the + // `Passed` arm) is covered at the glue level below.
+ #[test] + fn single_timeout_then_success_emits_no_failure_and_resets() { + let peer = strike_peer(1); + let mut strikes: HashMap = HashMap::new(); + let base = config::AUDIT_TIMEOUT_STRIKE_THRESHOLD; + let after_one = record_audit_timeout_strike(&mut strikes, &peer); + assert_eq!(after_one, 1); + assert!(!timeout_strike_reaches_threshold(after_one, base)); + strikes.remove(&peer); + assert!(!strikes.contains_key(&peer)); + } + + // The first `n - 1` consecutive timeouts stay below the threshold; the + // `n`-th (n = AUDIT_TIMEOUT_STRIKE_THRESHOLD) is the first to reach it. + #[test] + fn consecutive_timeouts_cross_threshold_at_n() { + let peer = strike_peer(2); + let mut strikes: HashMap = HashMap::new(); + let n = config::AUDIT_TIMEOUT_STRIKE_THRESHOLD; + let mut last = 0; + for i in 1..=n { + last = record_audit_timeout_strike(&mut strikes, &peer); + if i < n { + assert!(!timeout_strike_reaches_threshold(last, n)); + } + } + assert!(timeout_strike_reaches_threshold(last, n)); + // The count keeps climbing past the base threshold (so it can also + // cross a higher *adaptive* threshold), but is bounded by the strike + // cap — no unbounded growth. + let mut c = last; + for _ in 0..200 { + c = record_audit_timeout_strike(&mut strikes, &peer); + } + assert_eq!( + c, + super::AUDIT_TIMEOUT_STRIKE_MAX, + "count saturates at the max cap" + ); + assert!(c > n, "count must be able to exceed the base threshold"); + } + + // ADR-0002 Network Resilience: adaptive timeout threshold. + + #[test] + fn median_timeout_strikes_basics() { + let target = strike_peer(99); + let mut strikes: HashMap = HashMap::new(); + // No other peers → 0 (healthy network, threshold == base). + assert_eq!(median_timeout_strikes_excluding(&strikes, &target), 0); + strikes.insert(strike_peer(1), 1); + strikes.insert(strike_peer(2), 3); + strikes.insert(strike_peer(3), 5); + // Sorted [1,3,5], lower-median index 1 → 3. + assert_eq!(median_timeout_strikes_excluding(&strikes, &target), 3); + } + + // ADVERSARIAL (ADR point e + sybil-inflation bound). Two invariants the + // existing suite leaves unpinned: + // 1.
EVEN-count inputs must take the LOWER of the two middle values. The + // existing basics test only feeds an odd-length cohort, so an + // implementation that used `len/2` (upper median) would still pass it. + // Here [1,4] -> lower median 1 (not 4) and [2,4,6,8] -> 4 (not 6). + // 2. A sybil cohort pinned at the *strike cap* (the most an attacker could + // ever drive fabricated peers to) STILL cannot push the grace past + // MAX_ADAPTIVE_TIMEOUT_GRACE: the threshold saturates at base + max + // grace regardless of how high or how numerous the cohort is. + // FLIPS IF: median switches to the upper element on even input, or the + // grace clamp (`.min(MAX_ADAPTIVE_TIMEOUT_GRACE)`) is removed. + #[test] + fn even_count_takes_lower_median_and_sybil_cohort_cannot_exceed_grace_bound() { + let target = strike_peer(150); + + // Even count == 2: lower of [1, 4] is 1. + let mut two: HashMap = HashMap::new(); + two.insert(strike_peer(1), 1); + two.insert(strike_peer(2), 4); + assert_eq!( + median_timeout_strikes_excluding(&two, &target), + 1, + "even-count median must take the LOWER middle value (1), not the upper (4)" + ); + + // Even count == 4: sorted [2,4,6,8], lower median index (4-1)/2 = 1 → 4. + let mut four: HashMap = HashMap::new(); + // Peer ids 10, 11, 12, 13 carry strike counts 2, 4, 6, 8 respectively. + for (i, v) in (10u8..).zip([2u32, 4, 6, 8]) { + four.insert(strike_peer(i), v); + } + assert_eq!( + median_timeout_strikes_excluding(&four, &target), + 4, + "even-count median must be the lower middle (4), not the upper (6)" + ); + + // Sybil cohort pinned at the strike CAP — the strongest inflation an + // attacker could mount — must not lift the threshold past base + max + // grace. Try several cohort sizes (odd and even) to be sure.
+ for cohort in [2u8, 5, 8, 20] { + let mut strikes: HashMap = HashMap::new(); + for i in 0..cohort { + strikes.insert(strike_peer(50 + i), super::AUDIT_TIMEOUT_STRIKE_MAX); + } + let threshold = adaptive_timeout_threshold(&strikes, &target); + assert_eq!( + threshold, + config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + super::MAX_ADAPTIVE_TIMEOUT_GRACE, + "a sybil cohort at the strike cap (size {cohort}) must saturate the grace at \ + the bound, never exceed it" + ); + } + + // And even at the bounded-but-inflated threshold, a genuinely + // non-responsive target can still cross it (cap > max reachable + // threshold), so the bound never shields a bad node forever. + let mut strikes: HashMap = HashMap::new(); + for i in 0..8u8 { + strikes.insert(strike_peer(80 + i), super::AUDIT_TIMEOUT_STRIKE_MAX); + } + let threshold = adaptive_timeout_threshold(&strikes, &target); + let mut c = 0; + for _ in 0..(threshold + 5) { + c = record_audit_timeout_strike(&mut strikes, &target); + } + assert!( + timeout_strike_reaches_threshold(c, threshold), + "target must still cross the bounded inflated threshold ({c} vs {threshold})" + ); + } + + #[test] + fn lone_timing_out_peer_does_not_inflate_its_own_grace() { + // The peer under judgement is excluded from the median, so a single bad + // peer (the common case) is judged against the base threshold and caught + // — it cannot raise its own bar as its strike count climbs. + let bad = strike_peer(7); + let mut strikes: HashMap = HashMap::new(); + strikes.insert(bad, 5); // its own large count must not count + assert_eq!( + adaptive_timeout_threshold(&strikes, &bad), + config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + ); + } + + #[test] + fn widespread_timeouts_widen_the_grace() { + // Genuine disruption: many OTHER honest peers carry timeout strikes. The + // median rises, so the threshold for any given peer widens beyond the + // base — the audit system does not pile onto a struggling network.
+ let target = strike_peer(100); + let mut strikes: HashMap = HashMap::new(); + for i in 0..9u8 { + strikes.insert(strike_peer(i), 4); + } + // Every other peer sits at 4, so the median is 4 and the expected + // threshold is base + 4. NOTE(review): this presumes + // MAX_ADAPTIVE_TIMEOUT_GRACE >= 4 (the grace is clamped to that cap) — + // confirm against the constant. + assert_eq!( + adaptive_timeout_threshold(&strikes, &target), + 4 + config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + ); + assert!( + adaptive_timeout_threshold(&strikes, &target) > config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + ); + } + + #[test] + fn adaptive_grace_only_responds_to_timeouts_not_deterministic_failures() { + // The strike map is fed ONLY by timeouts (plan_failed_audit records a + // strike for Timeout and never for confirmed failures). So a flood of + // deterministic failures cannot inflate the median to buy grace. + let target = strike_peer(101); + let mut strikes: HashMap = HashMap::new(); + // Many confirmed (non-timeout) failures: these must NOT touch the map. + for i in 0..9u8 { + let action = plan_failed_audit( + &AuditFailureReason::DigestMismatch, + &mut strikes, + &strike_peer(i), + ); + assert_eq!(action, AuditFailureAction::ConfirmedPenalize); + } + assert!( + strikes.is_empty(), + "deterministic failures must not record strikes" + ); + // Threshold stays at the base — an attacker cannot widen grace by + // failing audits on purpose. + assert_eq!( + adaptive_timeout_threshold(&strikes, &target), + config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + ); + } + + // ADR-0002: "occasional surprise exams, keeps load low" — the per-peer + // cooldown must collapse a gossip flood into at most one audit per window. + + #[test] + fn gossip_flood_yields_at_most_one_audit_per_cooldown_window() { + let peer = strike_peer(1); + let mut map: HashMap = HashMap::new(); + let t0 = Instant::now(); + // First gossip in the window passes; a burst of further gossips at the + // same instant are all suppressed.
+ assert!(cooldown_allows_audit(&mut map, &peer, t0)); + let mut passed = 1; + for _ in 0..100 { + if cooldown_allows_audit(&mut map, &peer, t0) { + passed += 1; + } + } + assert_eq!( + passed, 1, + "a flood at one instant must trigger exactly one audit" + ); + } + + // ADR-0002 ordering invariant: `maybe_trigger_gossip_audit` stamps the + // per-peer cooldown BEFORE the probability lottery, so a LOSING ticket still + // consumes the window. This is the property the isolated cooldown tests above + // cannot see: they never sample the lottery, so a regression that reordered + // the gates (sample probability first, only stamp the cooldown on a win) + // would still pass them while breaking flood-resistance: a flood would then + // re-roll the lottery on EVERY message until one won, multiplying audits. + // + // We exercise the production gate order (cooldown-then-lottery) with a + // lottery driven by a fixed outcome instead of `gen_bool(..)`. The first + // message LOSES the lottery; the remaining flood messages all WIN. With the + // production order, the losing first ticket burns the window and every later + // winner in the same window is blocked, so there are 0 audits this window. If + // the gates were flipped, the second message's winning ticket would slip + // through. The window only reopens after the cooldown elapses. + // + // FLIPS IF: the lottery is sampled before `cooldown_allows_audit` (a losing + // ticket no longer consumes the window), re-enabling a flood-amplified audit + // storm. + #[test] + fn losing_lottery_still_consumes_cooldown_window() { + // Calls the SHIPPED `audit_launch_decision` (the same function + // `maybe_trigger_gossip_audit` uses) with the lottery outcome passed in + // as a fixed bool instead of `rand::thread_rng().gen_bool(..)`, so a + // reorder of the two gates in production fails this test. This is NOT a + // local reimplementation of the gates.
+ let peer = strike_peer(3); + let mut map: HashMap = HashMap::new(); + let t0 = Instant::now(); + + // First flooded message at t0 LOSES the lottery, but the cooldown is + // stamped BEFORE the lottery is consulted, so the window is now consumed. + assert!( + !audit_launch_decision(&mut map, &peer, t0, false), + "a losing ticket launches no audit" + ); + + // 99 more flooded messages at the same instant would all WIN the lottery, + // yet every one must be blocked by the cooldown the loser already stamped. + // (If production sampled the lottery FIRST, these would each get a fresh + // roll and audits would multiply — this assertion catches that reorder.) + let mut audits = 0; + for _ in 0..99 { + if audit_launch_decision(&mut map, &peer, t0, true) { + audits += 1; + } + } + assert_eq!( + audits, 0, + "a losing first ticket must consume the window so no later flooded \ + message in the same window can audit" + ); + + // The window only reopens after the cooldown elapses; the next winning + // ticket then launches exactly one audit. + let after = t0 + Duration::from_secs(config::AUDIT_ON_GOSSIP_COOLDOWN_SECS + 1); + assert!( + audit_launch_decision(&mut map, &peer, after, true), + "after the cooldown a winning ticket audits again" + ); + } + + #[test] + fn cooldown_lets_audit_through_after_the_window() { + let peer = strike_peer(2); + let mut map: HashMap = HashMap::new(); + let t0 = Instant::now(); + assert!(cooldown_allows_audit(&mut map, &peer, t0)); + // Within the window: suppressed. + let within = t0 + Duration::from_secs(config::AUDIT_ON_GOSSIP_COOLDOWN_SECS - 1); + assert!(!cooldown_allows_audit(&mut map, &peer, within)); + // Past the window: allowed again.
+ let after = t0 + Duration::from_secs(config::AUDIT_ON_GOSSIP_COOLDOWN_SECS + 1); + assert!(cooldown_allows_audit(&mut map, &peer, after)); + } + + #[test] + fn cooldown_is_per_peer_independent() { + let mut map: HashMap = HashMap::new(); + let t0 = Instant::now(); + // Different peers each get their own first-audit pass at the same instant. + for i in 0..20u8 { + assert!( + cooldown_allows_audit(&mut map, &strike_peer(i), t0), + "peer {i} should be auditable independently" + ); + } + } + + #[test] + fn inflated_adaptive_threshold_is_still_reachable_and_bounded() { + // When the median lifts the threshold above the base, a genuinely + // non-responsive peer's strike count must still be able to + // reach it (the count is no longer capped at the base). And the grace + // widening itself is bounded so it can't shield a bad node forever. + let target = strike_peer(200); + let mut strikes: HashMap = HashMap::new(); + // A cohort of other peers each at a high strike count. + for i in 0..9u8 { + strikes.insert(strike_peer(i), 10); + } + let threshold = adaptive_timeout_threshold(&strikes, &target); + // Grace is capped, so the threshold cannot exceed base + max grace. + assert!( + threshold <= config::AUDIT_TIMEOUT_STRIKE_THRESHOLD + super::MAX_ADAPTIVE_TIMEOUT_GRACE + ); + assert!(threshold > config::AUDIT_TIMEOUT_STRIKE_THRESHOLD); + // The target peer can accumulate strikes past that inflated threshold. + let mut c = 0; + // `0..threshold + 5` is `0..(threshold + 5)`: `+` binds tighter than `..`. + for _ in 0..threshold + 5 { + c = record_audit_timeout_strike(&mut strikes, &target); + } + assert!( + timeout_strike_reaches_threshold(c, threshold), + "a persistent peer must be able to cross the inflated threshold ({c} vs {threshold})" + ); + } + + #[test] + fn audit_on_gossip_constants_match_adr() { + // Tripwire on the ADR-locked tunables. The spot-check count sits at the + // top of the auditor's 3..=5 band (the auditor clamps to that band, so + // values above 5 would silently never be requested).
+ assert_eq!(config::AUDIT_SPOTCHECK_COUNT, 5); + assert!((config::AUDIT_ON_GOSSIP_PROBABILITY - 0.2).abs() < f64::EPSILON); + assert_eq!(config::AUDIT_ON_GOSSIP_COOLDOWN_SECS, 30 * 60); + } + + // (d) A confirmed storage-integrity failure penalizes immediately and + // revokes credit; it is not a timeout. + #[test] + fn digest_mismatch_is_not_a_timeout_and_penalizes_immediately() { + assert!(audit_failure_clears_bootstrap_claim( + &AuditFailureReason::DigestMismatch + )); + assert!(audit_failure_revokes_holder_credit( + &AuditFailureReason::DigestMismatch + )); + } + + // E2E (pure decision): an honest peer that repeatedly times out once and + // then recovers never reaches a penalty, because each success resets its + // strikes (the reset is simulated here by `strikes.remove`). + // FLIPS IF: the strike threshold is removed or success stops resetting. + #[test] + fn e2e_honest_intermittent_timeouts_never_penalized() { + let peer = strike_peer(10); + let base = config::AUDIT_TIMEOUT_STRIKE_THRESHOLD; + let mut strikes: HashMap = HashMap::new(); + for _ in 0..10 { + let after = record_audit_timeout_strike(&mut strikes, &peer); + assert_eq!( + decide_audit_failure_action(&AuditFailureReason::Timeout, after, base), + AuditFailureAction::TimeoutGrace + ); + strikes.remove(&peer); + } + assert!(!strikes.contains_key(&peer)); + } + + // E2E: a peer that times out on EVERY audit (never reset) crosses the + // threshold and is penalized — the deterrent against non-storing peers. + // FLIPS IF: per-challenge window widened so it answers in time, or strikes + // reset without a success.
+ #[test] + fn e2e_persistent_timeouts_get_penalized() { + let peer = strike_peer(11); + let mut strikes: HashMap = HashMap::new(); + let threshold = config::AUDIT_TIMEOUT_STRIKE_THRESHOLD; + let mut penalized_at = None; + for tick in 1..=(threshold + 2) { + let after = record_audit_timeout_strike(&mut strikes, &peer); + // Latch only the FIRST tick at which the decision becomes TimeoutPenalize. + if decide_audit_failure_action(&AuditFailureReason::Timeout, after, threshold) + == AuditFailureAction::TimeoutPenalize + && penalized_at.is_none() + { + penalized_at = Some(tick); + } + } + assert_eq!(penalized_at, Some(threshold)); + } + + // Glue: a Timeout through the real plan_failed_audit MUST record a strike on + // the map AND penalize once enough accumulate. + // FLIPS IF: the handler stops feeding Timeout through the strike counter + // (e.g. strikes_after hard-coded to 0). (Mutation-verified.) + #[test] + fn e2e_glue_timeout_records_strike_and_penalizes_at_threshold() { + let peer = strike_peer(20); + let mut strikes: HashMap = HashMap::new(); + let threshold = config::AUDIT_TIMEOUT_STRIKE_THRESHOLD; + let mut action = AuditFailureAction::TimeoutGrace; + for tick in 1..=threshold { + action = plan_failed_audit(&AuditFailureReason::Timeout, &mut strikes, &peer); + assert_eq!(strikes.get(&peer).copied(), Some(tick)); + } + assert_eq!(action, AuditFailureAction::TimeoutPenalize); + } + + // Glue: a confirmed failure through plan_failed_audit must NOT touch the + // strike map and must return ConfirmedPenalize.
+ #[test] + fn e2e_glue_confirmed_failure_leaves_strike_map_untouched() { + let peer = strike_peer(21); + let mut strikes: HashMap = HashMap::new(); + for reason in [ + AuditFailureReason::DigestMismatch, + AuditFailureReason::KeyAbsent, + AuditFailureReason::Rejected, + AuditFailureReason::MalformedResponse, + ] { + assert_eq!( + plan_failed_audit(&reason, &mut strikes, &peer), + AuditFailureAction::ConfirmedPenalize + ); + } + assert!(strikes.is_empty()); + } + + // ADR-0002 "Accounting and False Positives", adversarial: a DETERMINISTIC + // failure is acted on the FIRST time it occurs, "regardless of network + // conditions". Here the strike map is pre-loaded with many *other* peers + // timing out, which inflates the adaptive timeout grace to its cap — the + // most forgiving the network ever gets. Under that maximally-relaxed + // window: + // - a brand-new peer's FIRST deterministic failure (DigestMismatch / + // Rejected / MalformedResponse) STILL returns ConfirmedPenalize, never + // a grace lane, and never touches the strike map; while + // - that same peer's FIRST timeout is only TimeoutGrace. + // This proves the inflated grace is the timeout-only lane and can NEVER be + // weaponized to buy a deterministic failure even one round of delay. + // FLIPS IF: deterministic failures start consulting the strike threshold, + // or ConfirmedPenalize is collapsed into a timeout action. + #[test] + fn deterministic_failure_penalizes_first_time_under_inflated_grace() { + let mut strikes: HashMap = HashMap::new(); + // Saturate the adaptive grace: many other peers each carrying a high + // consecutive-timeout count, so the median (and thus the grace) is + // pushed to its MAX cap for any newly-judged peer. + for b in 100..150u8 { + let other = strike_peer(b); + for _ in 0..AUDIT_TIMEOUT_STRIKE_MAX { + record_audit_timeout_strike(&mut strikes, &other); + } + } + let victim = strike_peer(7); + // Sanity: the grace seen by the victim is genuinely inflated above base.
+ let inflated = adaptive_timeout_threshold(&strikes, &victim); + assert!( + inflated > config::AUDIT_TIMEOUT_STRIKE_THRESHOLD, + "test precondition: grace must be inflated, got {inflated}" + ); + + // First deterministic failure of each kind -> ConfirmedPenalize on + // occurrence #1, and the victim is never inserted into the strike map. + // NOTE(review): `KeyAbsent` is treated as a confirmed failure by the glue + // test above but is not exercised in this loop — confirm whether that + // omission is deliberate. + for reason in [ + AuditFailureReason::DigestMismatch, + AuditFailureReason::Rejected, + AuditFailureReason::MalformedResponse, + ] { + let action = plan_failed_audit(&reason, &mut strikes, &victim); + assert_eq!( + action, + AuditFailureAction::ConfirmedPenalize, + "{reason:?} must penalize on the first occurrence regardless of grace" + ); + // Implied by the `assert_eq!` above; kept as an explicit statement of intent. + assert_ne!( + action, + AuditFailureAction::TimeoutPenalize, + "a deterministic failure must NOT be routed through the (eviction-gated) \ + timeout-penalize lane" + ); + assert!( + !strikes.contains_key(&victim), + "deterministic failure must not touch the timeout strike map" + ); + // And it always revokes holder credit / clears the claim. + assert!(audit_failure_revokes_holder_credit(&reason)); + assert!(audit_failure_clears_bootstrap_claim(&reason)); + } + + // The SAME victim's first timeout, under the same inflated grace, is + // only TimeoutGrace (no penalty, no revocation, claim retained). + let timeout_action = plan_failed_audit(&AuditFailureReason::Timeout, &mut strikes, &victim); + assert_eq!(timeout_action, AuditFailureAction::TimeoutGrace); + assert_eq!(strikes.get(&victim).copied(), Some(1)); + assert!(!audit_failure_revokes_holder_credit( + &AuditFailureReason::Timeout + )); + assert!(!audit_failure_clears_bootstrap_claim( + &AuditFailureReason::Timeout + )); + } + + /// The exact decision the `Failed` arm of `handle_subtree_audit_result` + /// uses: confirmed failures revoke credit, `Timeout` does not.
+ #[test] + fn confirmed_failures_revoke_credit_timeout_does_not() { + for reason in [ + AuditFailureReason::MalformedResponse, + AuditFailureReason::DigestMismatch, + AuditFailureReason::KeyAbsent, + AuditFailureReason::Rejected, + ] { + assert!( + audit_failure_revokes_holder_credit(&reason), + "confirmed failure {reason:?} must revoke holder credit" + ); + } + // `Timeout` is the one reason checked here that must keep holder credit. + assert!( + !audit_failure_revokes_holder_credit(&AuditFailureReason::Timeout), + "Timeout must NOT revoke credit (single dropped packet != storage loss)" + ); + } + + /// Wiring test for the security fix: the helper the handler calls + /// actually strips a credited peer on a confirmed failure + /// (`DigestMismatch`), and actually RETAINS credit on `Timeout`. + /// Records genuine credit first so neither assertion is vacuous; + /// this fails if `forget_peer` stops being called, or if the + /// `Timeout` exclusion is dropped (both verified by mutation). + #[test] + fn apply_revocation_strips_on_digest_mismatch_retains_on_timeout() { + let peer = test_peer(0xAB); + let key = test_key(1); + let hash = [0xCD; 32]; + + // Confirmed failure -> credit revoked. + let mut provers = RecentProvers::new(); + provers.record_proof(key, peer, hash, Instant::now()); + assert!( + provers.is_credited_holder(&key, &peer, &hash), + "precondition: peer credited before failure" + ); + apply_audit_failure_credit_revocation( + &mut provers, + &peer, + &AuditFailureReason::DigestMismatch, + ); + assert!( + !provers.is_credited_holder(&key, &peer, &hash), + "DigestMismatch must strip the peer's holder credit" + ); + + // Timeout -> credit retained.
+ let mut provers_timeout = RecentProvers::new(); + provers_timeout.record_proof(key, peer, hash, Instant::now()); + apply_audit_failure_credit_revocation( + &mut provers_timeout, + &peer, + &AuditFailureReason::Timeout, + ); + assert!( + provers_timeout.is_credited_holder(&key, &peer, &hash), + "Timeout must retain holder credit (deliberate liveness cushion)" + ); } #[test] diff --git a/src/replication/neighbor_sync.rs b/src/replication/neighbor_sync.rs index 897d41ad..b84dab6a 100644 --- a/src/replication/neighbor_sync.rs +++ b/src/replication/neighbor_sync.rs @@ -182,11 +182,23 @@ pub async fn sync_with_peer( config: &ReplicationConfig, is_bootstrapping: bool, ) -> Option { - sync_with_peer_with_outcome(peer, p2p_node, storage, paid_list, config, is_bootstrapping) - .await - .map(|outcome| outcome.response) + sync_with_peer_with_outcome( + peer, + p2p_node, + storage, + paid_list, + config, + is_bootstrapping, + None, + ) + .await + .map(|outcome| outcome.response) } +/// `commitment`: sender's current commitment to piggyback on the request. +/// `None` if the responder hasn't rotated one yet (e.g. fresh boot, +/// empty storage) — receiver falls back to legacy path. +#[allow(clippy::too_many_arguments)] pub(crate) async fn sync_with_peer_with_outcome( peer: &PeerId, p2p_node: &Arc, @@ -194,6 +206,7 @@ pub(crate) async fn sync_with_peer_with_outcome( paid_list: &Arc, config: &ReplicationConfig, is_bootstrapping: bool, + commitment: Option, ) -> Option { // Build peer-targeted hint sets (Rule 7). 
let sent_replica_hints = build_replica_hints_for_peer_with_close_groups( @@ -215,6 +228,7 @@ pub(crate) async fn sync_with_peer_with_outcome( replica_hints, paid_hints, bootstrapping: is_bootstrapping, + commitment, }; let request_id = rand::thread_rng().gen::(); let msg = ReplicationMessage { @@ -335,11 +349,13 @@ pub async fn handle_sync_request( paid_list, config, is_bootstrapping, + None, ) .await; (response, sender_in_rt) } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_sync_request_with_proofs( sender: &PeerId, _request: &NeighborSyncRequest, @@ -348,6 +364,7 @@ pub(crate) async fn handle_sync_request_with_proofs( paid_list: &Arc, config: &ReplicationConfig, is_bootstrapping: bool, + my_commitment: Option, ) -> (NeighborSyncResponse, Vec, bool) { let sender_in_rt = p2p_node.dht_manager().is_in_routing_table(sender).await; @@ -376,6 +393,7 @@ pub(crate) async fn handle_sync_request_with_proofs( paid_hints, bootstrapping: is_bootstrapping, rejected_keys: Vec::new(), + commitment: my_commitment, }; // Rule 4-6: accept inbound hints only if sender is in LocalRT. @@ -977,6 +995,7 @@ mod tests { paid_hints: outbound_paid_hints.clone(), bootstrapping: false, rejected_keys: Vec::new(), + commitment: None, }; // Inbound hints from the sender (would be in the request). diff --git a/src/replication/protocol.rs b/src/replication/protocol.rs index a5151a33..6ef424fd 100644 --- a/src/replication/protocol.rs +++ b/src/replication/protocol.rs @@ -109,11 +109,22 @@ pub enum ReplicationMessageBody { /// Response with the record data. FetchResponse(FetchResponse), - // === Audit (Section 15) === - /// Storage audit challenge. + // === Responsible-chunk audit (per-key digests) === + /// Per-key audit challenge: used by the responsible-chunk audit and the + /// prune-confirmation path. AuditChallenge(AuditChallenge), - /// Response to audit challenge. + /// Response to a per-key audit challenge. 
AuditResponse(AuditResponse), + + // === Storage-bound subtree audit (ADR-0002) === + /// Gossip-triggered contiguous-subtree storage audit challenge (round 1). + SubtreeAuditChallenge(SubtreeAuditChallenge), + /// Response to a contiguous-subtree storage audit challenge (round 1). + SubtreeAuditResponse(SubtreeAuditResponse), + /// Surprise byte challenge for the spot-checked leaves (round 2). + SubtreeByteChallenge(SubtreeByteChallenge), + /// Response carrying the requested chunks' original bytes (round 2). + SubtreeByteResponse(SubtreeByteResponse), } // --------------------------------------------------------------------------- @@ -177,6 +188,14 @@ pub struct NeighborSyncRequest { pub paid_hints: Vec, /// Whether sender is currently bootstrapping. pub bootstrapping: bool, + /// Sender's signed storage commitment (optional, see + /// [`crate::replication::commitment`]). `None` from old peers; from + /// new peers this carries the Merkle-root commitment over the + /// sender's claimed keys. Receivers that recognize it store it as + /// the per-peer "last known commitment" used to pin commitment-bound + /// audits. + #[serde(default)] + pub commitment: Option, } /// Neighbor sync response carrying own hint sets. @@ -190,6 +209,10 @@ pub struct NeighborSyncResponse { pub bootstrapping: bool, /// Keys that receiver rejected (optional feedback to sender). pub rejected_keys: Vec, + /// Receiver's signed storage commitment (optional, see + /// [`NeighborSyncRequest::commitment`]). + #[serde(default)] + pub commitment: Option, } // --------------------------------------------------------------------------- @@ -271,11 +294,14 @@ pub enum FetchResponse { // Audit Messages // --------------------------------------------------------------------------- -/// Storage audit challenge (Section 15). +/// Per-key audit challenge. /// /// The challenger picks a random nonce and a set of keys the challenged peer -/// should hold, then sends this challenge. 
The challenged peer must prove -/// storage by returning per-key BLAKE3 digests. +/// should hold, then sends this challenge. The challenged peer proves storage +/// by returning per-key BLAKE3 digests. Used by the responsible-chunk audit +/// (audit #2: a node samples keys a close peer should hold) and by the +/// prune-confirmation path (a node checks a peer still holds a key before +/// pruning its own copy). #[derive(Debug, Clone, Serialize, Deserialize)] pub struct AuditChallenge { /// Unique challenge identifier. @@ -288,7 +314,7 @@ pub struct AuditChallenge { pub keys: Vec, } -/// Response to audit challenge. +/// Response to a per-key audit challenge. #[derive(Debug, Clone, Serialize, Deserialize)] pub enum AuditResponse { /// Per-key digests proving storage. @@ -318,6 +344,164 @@ pub enum AuditResponse { }, } +/// Gossip-triggered contiguous-subtree storage audit challenge (ADR-0002). +/// +/// The auditor pins the commitment a peer just gossiped and sends a fresh +/// random nonce. The nonce alone deterministically selects one contiguous +/// subtree of the peer's committed Merkle tree (see +/// [`crate::replication::subtree::select_subtree_path`]); the auditor does +/// **not** name keys. The responder must reply with a +/// [`SubtreeAuditResponse::Proof`] for that selected subtree against the pinned +/// commitment, or a [`SubtreeAuditResponse::Rejected`] if it genuinely cannot +/// (for a recently gossiped pinned commitment a rejection is a confirmed +/// failure, since the responder retains its last two gossiped commitments). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SubtreeAuditChallenge { + /// Unique challenge identifier. + pub challenge_id: u64, + /// Random nonce. Selects the subtree AND freshens each leaf's possession + /// hash, so a stored answer cannot be replayed. + pub nonce: [u8; 32], + /// Challenged peer ID. Bound into each leaf's possession hash. 
+ pub challenged_peer_id: [u8; 32], + /// The auditor's pin: the [`crate::replication::commitment::commitment_hash`] + /// of the commitment the peer just gossiped. The response's commitment must + /// hash to exactly this value. + pub expected_commitment_hash: [u8; 32], +} + +/// Response to a contiguous-subtree storage audit challenge (ADR-0002). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum SubtreeAuditResponse { + /// The single-contiguous-subtree proof. + /// + /// Carries the responder's signed commitment (so the auditor re-derives + /// `key_count` and confirms the pin and signature) and the + /// nonce-selected subtree expanded to its leaves plus the sibling + /// cut-hashes on the path to the root. This is **round 1** of the + /// two-round audit. The auditor: + /// 1. confirms `commitment_hash(commitment) == expected_commitment_hash` + /// and the signature is valid; + /// 2. re-derives the selected subtree from `(nonce, key_count)`, rebuilds + /// the root from the proof, and requires it to equal the commitment + /// root (structure). + /// + /// The leaves carry only hashes (`bytes_hash`, `nonced_hash`), so this round + /// proves the tree SHAPE is committed — not that the bytes are still held. + /// Real possession is proven in **round 2**: the auditor picks a few of the + /// just-verified leaves and sends a [`SubtreeByteChallenge`] requesting their + /// original chunk bytes FROM the responder (see that type). + Proof { + /// The challenge this response answers. + challenge_id: u64, + /// The signed commitment whose root the proof is against. + commitment: crate::replication::commitment::StorageCommitment, + /// The nonce-selected contiguous subtree proof. + proof: crate::replication::subtree::SubtreeProof, + }, + /// Peer is still bootstrapping (not ready for audit). + Bootstrapping { + /// The challenge this response answers. + challenge_id: u64, + }, + /// Challenge rejected. 
The `reason` is for logging only; for a recently + /// gossiped pinned commitment a rejection is a confirmed failure (the + /// responder retains its last two gossiped commitments and must be able to + /// answer either). + Rejected { + /// The challenge this response answers. + challenge_id: u64, + /// Human-readable rejection reason. + reason: String, + }, +} + +/// Round 2 of the storage audit (ADR-0002): the **surprise byte challenge**. +/// +/// After the auditor has structurally verified a [`SubtreeAuditResponse::Proof`] +/// it picks a small, nonce-derived random sample of that subtree's just-proven +/// leaves (the responder cannot predict which) and asks the responder to return +/// the ORIGINAL chunk bytes for exactly those keys. The auditor then checks each +/// returned chunk against the committed leaf: +/// - `BLAKE3(bytes) == leaf.bytes_hash` (the chunk's content address), AND +/// - `compute_audit_digest(nonce, peer, key, bytes) == leaf.nonced_hash`. +/// +/// This makes possession non-delegable to the auditor: the auditor needs to +/// hold NONE of the responder's chunks. A responder that committed to a chunk it +/// no longer holds cannot fabricate bytes that hash to the committed address (a +/// preimage break), so it is caught regardless of who audits it. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SubtreeByteChallenge { + /// The same `challenge_id` as the round-1 [`SubtreeAuditChallenge`], so the + /// responder/auditor correlate the two rounds. + pub challenge_id: u64, + /// The same nonce as round 1 — needed for the freshness (`nonced_hash`) + /// check and to bind these bytes to this audit. + pub nonce: [u8; 32], + /// The challenged peer ID (bound into each leaf's possession hash). + pub challenged_peer_id: [u8; 32], + /// The pinned commitment hash from round 1, so the responder resolves the + /// SAME tree it just proved and serves bytes only for keys it committed to. 
+ pub expected_commitment_hash: [u8; 32], + /// The exact keys whose original bytes the responder must return. These are + /// the auditor's nonce-derived spot-check sample of the round-1 subtree. + pub keys: Vec, +} + +/// One requested chunk in a [`SubtreeByteResponse`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum SubtreeByteItem { + /// The responder holds this committed key and returns its original bytes. + Present { + /// The requested key. + key: XorName, + /// The original chunk bytes (the auditor re-hashes to verify). + bytes: Vec, + }, + /// The responder committed to this key but cannot serve its bytes. This is a + /// PROVABLE cheat (it published a commitment over a chunk it does not hold), + /// so the auditor counts it as a confirmed failure — NOT a graced timeout. + /// Distinguishing this explicit signal from silence is what separates a + /// deleter (instant fail) from a dropped packet (timeout). + Absent { + /// The committed key the responder could not serve. + key: XorName, + }, +} + +/// Response to a [`SubtreeByteChallenge`] (round 2). One item per requested key, +/// in the requested order. +/// +/// Sizing rule: a challenge carries at most +/// [`MAX_BYTE_CHALLENGE_KEYS`](super::config::MAX_BYTE_CHALLENGE_KEYS) keys — +/// the auditor batches its sample, the responder rejects larger requests — so +/// the WORST-CASE `Items` response (every chunk at `MAX_CHUNK_SIZE`) always +/// encodes under [`MAX_REPLICATION_MESSAGE_SIZE`]. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum SubtreeByteResponse { + /// The responder's per-key answers (bytes or an explicit absent signal). + Items { + /// The challenge this response answers. + challenge_id: u64, + /// One entry per requested key. + items: Vec, + }, + /// Peer is still bootstrapping (should not happen mid-audit, but handled). + Bootstrapping { + /// The challenge this response answers. 
+ challenge_id: u64, + }, + /// The responder rejects the byte challenge outright (e.g. it no longer + /// retains the pinned commitment). For a recently gossiped commitment the + /// auditor treats this as a confirmed failure, like round 1. + Rejected { + /// The challenge this response answers. + challenge_id: u64, + /// Human-readable rejection reason. + reason: String, + }, +} + // --------------------------------------------------------------------------- // Audit digest helper // --------------------------------------------------------------------------- @@ -392,6 +576,33 @@ impl std::error::Error for ReplicationProtocolError {} mod tests { use super::*; + // === Round-2 byte response sizing === + + #[test] + fn max_batch_worst_case_byte_response_fits_wire_cap() { + // The auditor batches its round-2 sample to MAX_BYTE_CHALLENGE_KEYS per + // challenge precisely so this worst case — every requested chunk at + // MAX_CHUNK_SIZE — still encodes. If this fails, honest responders + // would hit encode errors and be penalized as timeouts. 
+ let items: Vec = (0..crate::replication::config::MAX_BYTE_CHALLENGE_KEYS) + .map(|i| SubtreeByteItem::Present { + key: [u8::try_from(i).unwrap_or(u8::MAX); 32], + bytes: vec![0xAB; crate::ant_protocol::MAX_CHUNK_SIZE], + }) + .collect(); + let msg = ReplicationMessage { + request_id: 7, + body: ReplicationMessageBody::SubtreeByteResponse(SubtreeByteResponse::Items { + challenge_id: 7, + items, + }), + }; + let encoded = msg + .encode() + .expect("worst-case max-batch byte response must fit the wire cap"); + assert!(encoded.len() <= MAX_REPLICATION_MESSAGE_SIZE); + } + // === Fresh Replication roundtrip === #[test] @@ -490,6 +701,110 @@ mod tests { // === Neighbor Sync roundtrips === + // -- backwards compat across the wire-type extension -------------------- + + /// Backwards-compat: an old peer that has the v0 layout of + /// `NeighborSyncRequest` (no `commitment` field) can still decode a + /// message encoded by a new peer that emits `commitment: None`. This + /// is the realistic mixed-version case during rollout: new peers + /// gossip with the field; old peers must not crash. + /// + /// The check works because postcard's [`from_bytes`] is lenient on + /// trailing bytes — the old decoder reads what it knows about and + /// stops, the new fields are silently ignored. This test pins that + /// invariant so any future codec/library swap that breaks it is + /// caught immediately. 
+ #[test] + fn old_decoder_tolerates_new_neighbor_sync_request() { + use serde::Deserialize; + #[derive(Deserialize)] + struct OldNeighborSyncRequest { + #[allow(dead_code)] + pub replica_hints: Vec, + #[allow(dead_code)] + pub paid_hints: Vec, + #[allow(dead_code)] + pub bootstrapping: bool, + } + + let new_req = NeighborSyncRequest { + replica_hints: vec![[0x01; 32], [0x02; 32]], + paid_hints: vec![[0x03; 32]], + bootstrapping: true, + commitment: None, + }; + let encoded = postcard::to_stdvec(&new_req).expect("encode"); + let old_decoded: OldNeighborSyncRequest = + postcard::from_bytes(&encoded).expect("old decoder accepts"); + // Field-by-field check would fail if old peer misaligned on the + // length prefix — passing decode is the structural check. + assert_eq!(old_decoded.replica_hints.len(), 2); + assert_eq!(old_decoded.paid_hints.len(), 1); + assert!(old_decoded.bootstrapping); + } + + /// Same property for `NeighborSyncResponse`. + #[test] + fn old_decoder_tolerates_new_neighbor_sync_response() { + use serde::Deserialize; + #[derive(Deserialize)] + struct OldNeighborSyncResponse { + #[allow(dead_code)] + pub replica_hints: Vec, + #[allow(dead_code)] + pub paid_hints: Vec, + #[allow(dead_code)] + pub bootstrapping: bool, + #[allow(dead_code)] + pub rejected_keys: Vec, + } + + let new_resp = NeighborSyncResponse { + replica_hints: vec![[0x04; 32]], + paid_hints: vec![], + bootstrapping: false, + rejected_keys: vec![[0x05; 32]], + commitment: None, + }; + let encoded = postcard::to_stdvec(&new_resp).expect("encode"); + let old_decoded: OldNeighborSyncResponse = + postcard::from_bytes(&encoded).expect("old decoder accepts"); + assert_eq!(old_decoded.replica_hints.len(), 1); + assert_eq!(old_decoded.rejected_keys.len(), 1); + } + + /// Roundtrip: a new peer can decode its own message including the + /// commitment field. Catches accidental serde annotation breakage + /// (e.g. forgetting `#[serde(default)]` on the new field). 
+ #[test] + fn new_peer_roundtrips_with_commitment_some() { + use crate::replication::commitment::{sign_commitment, StorageCommitment}; + use saorsa_pqc::api::sig::ml_dsa_65; + + let (pk, sk) = ml_dsa_65().generate_keypair().expect("keygen"); + let root = [0x7Fu8; 32]; + let sender = [0xCCu8; 32]; + let pk_bytes = pk.to_bytes(); + let sig = sign_commitment(&sk, &root, 3, &sender, &pk_bytes).expect("sign"); + let commitment = StorageCommitment { + root, + key_count: 3, + sender_peer_id: sender, + sender_public_key: pk_bytes, + signature: sig, + }; + + let req = NeighborSyncRequest { + replica_hints: vec![[0x01; 32]], + paid_hints: vec![], + bootstrapping: false, + commitment: Some(commitment.clone()), + }; + let encoded = postcard::to_stdvec(&req).expect("encode"); + let decoded: NeighborSyncRequest = postcard::from_bytes(&encoded).expect("new decoder"); + assert_eq!(decoded.commitment, Some(commitment)); + } + #[test] fn neighbor_sync_request_roundtrip() { let msg = ReplicationMessage { @@ -498,6 +813,7 @@ mod tests { replica_hints: vec![[0x01; 32], [0x02; 32]], paid_hints: vec![[0x03; 32]], bootstrapping: true, + commitment: None, }), }; let encoded = msg.encode().expect("encode should succeed"); @@ -522,6 +838,7 @@ mod tests { paid_hints: vec![], bootstrapping: false, rejected_keys: vec![[0x05; 32], [0x06; 32]], + commitment: None, }), }; let encoded = msg.encode().expect("encode should succeed"); diff --git a/src/replication/pruning.rs b/src/replication/pruning.rs index 4618ab09..68eebc51 100644 --- a/src/replication/pruning.rs +++ b/src/replication/pruning.rs @@ -17,6 +17,7 @@ use saorsa_core::{DHTNode, P2PNode}; use tokio::sync::RwLock; use crate::ant_protocol::XorName; +use crate::replication::commitment_state::ResponderCommitmentState; use crate::replication::config::{ ReplicationConfig, AUDIT_FAILURE_TRUST_WEIGHT, MAX_PRUNE_AUDIT_CHALLENGES_PER_PASS, REPLICATION_PROTOCOL_ID, @@ -74,6 +75,11 @@ pub struct PrunePassContext<'a> { pub current_sync_epoch: u64, 
/// Whether remote prune-confirmation audits are allowed this pass. pub allow_remote_prune_audits: bool, + /// Responder commitment state, used to veto deleting a chunk still held + /// under a recently-gossiped commitment (so the storage-commitment audit's + /// round-2 byte challenge cannot false-positive an honest node). `None` on + /// the legacy/test-only prune path, which keeps the pre-retention behavior. + pub commitment_state: Option<&'a Arc>, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -88,6 +94,7 @@ struct RecordPruneStats { marked: usize, cleared: usize, pruned: usize, + held_by_commitment: usize, } #[derive(Debug, Default)] @@ -122,6 +129,10 @@ enum RecordPruneKeyState { Cleared, BootstrapDeferred, BudgetDeferred, + /// Out of range, but still committed under a recently-gossiped commitment: + /// deletion is vetoed (and the out-of-range hysteresis clock is not even + /// started) until the key ages out of the last-2-gossiped window. + HeldByCommitment, Candidate(RecordPruneCandidate), } @@ -174,6 +185,7 @@ pub async fn run_prune_pass( repair_proofs: &repair_proofs, current_sync_epoch: 0, allow_remote_prune_audits, + commitment_state: None, }) .await } @@ -241,6 +253,9 @@ async fn prune_stored_records(ctx: &PrunePassContext<'_>) -> (usize, RecordPrune RecordPruneKeyState::BudgetDeferred => { budget_deferred = budget_deferred.saturating_add(1); } + RecordPruneKeyState::HeldByCommitment => { + stats.held_by_commitment = stats.held_by_commitment.saturating_add(1); + } RecordPruneKeyState::Candidate(candidate) => { last_selected_offset = Some(offset); candidates.push(candidate); @@ -270,6 +285,14 @@ async fn prune_stored_records(ctx: &PrunePassContext<'_>) -> (usize, RecordPrune ); } + if stats.held_by_commitment > 0 { + debug!( + "Vetoed {} prune candidate(s) still committed under a recently-gossiped \ + commitment (bounded reprieve until they age out of the retention window)", + stats.held_by_commitment + ); + } + let present_by_key = 
collect_record_prune_proofs( &candidates, ctx.storage, @@ -285,6 +308,7 @@ async fn prune_stored_records(ctx: &PrunePassContext<'_>) -> (usize, RecordPrune ctx.paid_list, ctx.p2p_node, ctx.config, + ctx.commitment_state, ) .await; stats.cleared += revalidated_cleared; @@ -317,6 +341,24 @@ async fn evaluate_record_prune_key( return outcome; } + // Retention veto: the key has left our close group, but if it is still + // committed under a recently-gossiped commitment a neighbour can pin that + // root and demand its bytes in a round-2 byte challenge. Deleting it now + // would turn an honest node's response into `Absent` → a confirmed audit + // failure. Veto deletion AND do not even start the out-of-range hysteresis + // clock yet: the commitment rebuild only commits to keys we are still + // responsible for, so this key drops out of the next rebuilt commitment and + // ages out of the last-2-gossiped window within at most + // `RETAINED_GOSSIPED_COMMITMENTS` gossip rotations, after which `is_held` + // returns false and the key prunes through the normal path. This is a + // bounded reprieve, not a permanent pin. + if let Some(cs) = ctx.commitment_state { + if cs.is_held(key) { + outcome.state = RecordPruneKeyState::HeldByCommitment; + return outcome; + } + } + if ctx.paid_list.record_out_of_range_since(key).is_none() { outcome.marked = true; } @@ -559,6 +601,7 @@ async fn revalidated_record_prune_keys( paid_list: &Arc, p2p_node: &Arc, config: &ReplicationConfig, + commitment_state: Option<&Arc>, ) -> (Vec, usize) { let dht = p2p_node.dht_manager(); let mut keys_to_delete = Vec::new(); @@ -566,6 +609,16 @@ async fn revalidated_record_prune_keys( let now = Instant::now(); for candidate in candidates { + // TOCTOU guard: a rotation/gossip may have (re-)committed this key + // between candidate selection and now. 
Re-check retention immediately + // before scheduling deletion so we never delete bytes a recently + // gossiped commitment still owes in a round-2 byte challenge. + if let Some(cs) = commitment_state { + if cs.is_held(&candidate.key) { + continue; + } + } + let closest: Vec = dht .find_closest_nodes_local_with_self(&candidate.key, config.close_group_size) .await; @@ -668,10 +721,21 @@ async fn peer_proves_record( let encoded = encode_prune_audit_challenge(&peer, key, challenge_id, nonce)?; let Some(decoded) = send_prune_audit_challenge(&peer, &key, encoded, p2p_node, config).await else { - // No decoded response means we did not observe the peer stop claiming - // bootstrap status. Preserve any active claim so a later claim is not - // misclassified as repeated abuse. - report_prune_audit_failure_once(&peer, &key, p2p_node, config, report_state).await; + // No decoded response means a timeout or an undecodable reply — the + // same "no response" case the main audit path treats as a timeout. + // TIMEOUT-EVICTION-DISABLED: do NOT penalise on a prune-audit timeout + // during the breaking rollout (a not-yet-upgraded peer, or a briefly + // slow one, must not be evicted by a no-response). This mirrors the + // suppressed timeout penalty in handle_failed_audit; only a DECODED + // PruneAuditStatus::Failed below (a peer that answered with bad/absent + // bytes) is penalised. Grep TIMEOUT-EVICTION-DISABLED to re-enable in + // the follow-up release once enough nodes have upgraded. 
+ debug!( + "Prune audit for {peer} key {} got no decodable response \ + (eviction disabled this release — not penalising)", + hex::encode(key) + ); + // report_prune_audit_failure_once(&peer, &key, p2p_node, config, report_state).await; return None; }; @@ -699,6 +763,11 @@ fn prune_audit_response_clears_bootstrap_claim(status: PruneAuditStatus) -> bool matches!(status, PruneAuditStatus::Proven | PruneAuditStatus::Failed) } +// The responder for an incoming `AuditChallenge` (including prune-confirmation +// challenges, which reuse the same wire message) lives in +// `super::handle_audit_challenge_msg` -> `audit::handle_audit_challenge`, the +// responsible-chunk audit responder. No separate prune-only responder is needed. + fn encode_prune_audit_challenge( peer: &PeerId, key: XorName, @@ -740,7 +809,7 @@ async fn send_prune_audit_challenge( peer, REPLICATION_PROTOCOL_ID, encoded, - config.audit_response_timeout(1), + config.prune_audit_response_timeout, ) .await { diff --git a/src/replication/quorum.rs b/src/replication/quorum.rs index 5f4d99af..19186639 100644 --- a/src/replication/quorum.rs +++ b/src/replication/quorum.rs @@ -202,19 +202,52 @@ pub fn evaluate_key_evidence( evidence: &KeyVerificationEvidence, targets: &VerificationTargets, config: &ReplicationConfig, +) -> KeyVerificationOutcome { + evaluate_key_evidence_with_holder_check(key, evidence, targets, config, |_, _| true) +} + +/// Variant of [`evaluate_key_evidence`] that consults a holder-credit +/// predicate before counting a peer's Present evidence (v12 §6). +/// +/// `holder_credit` is invoked as `(peer, key) -> bool`. Returning `false` +/// downgrades a Present claim to Unresolved (we don't trust this peer's +/// "I have it" without a recent commitment-bound audit proving it). +/// Returning `true` keeps today's behaviour. Paid-list evidence is +/// independent of holder credit (the paid-list lookup is a property of +/// the receiving peer's own data, not a claim about K being present). 
+/// +/// The non-`_with_holder_check` form preserves prior behaviour by +/// passing a predicate that always returns true. New call sites that +/// have a `RecentProvers` cache + commitment-by-peer table should pass +/// a real predicate. +#[must_use] +pub fn evaluate_key_evidence_with_holder_check( + key: &XorName, + evidence: &KeyVerificationEvidence, + targets: &VerificationTargets, + config: &ReplicationConfig, + holder_credit: impl Fn(&PeerId, &XorName) -> bool, ) -> KeyVerificationOutcome { let quorum_peers = targets .quorum_targets .get(key) .map_or(&[][..], Vec::as_slice); - // Count presence evidence from QuorumTargets. + // Count presence evidence from QuorumTargets. v12 §6: a peer that + // claims Present but is not commitment-credited for K is downgraded + // to Unresolved (we may have to retry once they re-prove storage). let mut presence_positive = 0usize; let mut presence_unresolved = 0usize; for peer in quorum_peers { match evidence.presence.get(peer) { - Some(PresenceEvidence::Present) => presence_positive += 1, + Some(PresenceEvidence::Present) => { + if holder_credit(peer, key) { + presence_positive += 1; + } else { + presence_unresolved += 1; + } + } Some(PresenceEvidence::Absent) => {} Some(PresenceEvidence::Unresolved) | None => { presence_unresolved += 1; @@ -662,6 +695,108 @@ mod tests { ); } + // ----------------------------------------------------------------------- + // v12 §6 holder-credit predicate downgrades uncredited peers + // ----------------------------------------------------------------------- + + #[test] + fn quorum_downgrades_uncredited_present_peers() { + // 7 quorum peers, threshold 4. 4 say Present, 3 say Absent — + // would normally pass. But with a holder-credit predicate that + // only credits 2 of them, presence_positive drops to 2 and the + // 2 uncredited Presents become Unresolved. 
Total = 2 positive + // + 2 unresolved + 3 absent = 5 valid → still possible → + // QuorumInconclusive (not yet failed, but not verified either). + let key = xor_name_from_byte(0x33); + let config = ReplicationConfig::default(); + let quorum_peers: Vec = (1..=7).map(peer_id_from_byte).collect(); + let targets = single_key_targets(&key, quorum_peers.clone(), vec![]); + + let evidence = build_evidence( + vec![ + (quorum_peers[0], PresenceEvidence::Present), + (quorum_peers[1], PresenceEvidence::Present), + (quorum_peers[2], PresenceEvidence::Present), + (quorum_peers[3], PresenceEvidence::Present), + (quorum_peers[4], PresenceEvidence::Absent), + (quorum_peers[5], PresenceEvidence::Absent), + (quorum_peers[6], PresenceEvidence::Absent), + ], + vec![], + ); + + // Credit only the first two peers (the other two Presents are + // uncredited and will be downgraded to Unresolved). + let credit = |peer: &PeerId, _: &XorName| -> bool { + *peer == quorum_peers[0] || *peer == quorum_peers[1] + }; + let outcome = + evaluate_key_evidence_with_holder_check(&key, &evidence, &targets, &config, credit); + assert!( + matches!(outcome, KeyVerificationOutcome::QuorumInconclusive), + "credit downgrade should drop presence_positive below threshold, got {outcome:?}" + ); + } + + #[test] + fn quorum_passes_when_all_present_peers_are_credited() { + let key = xor_name_from_byte(0x34); + let config = ReplicationConfig::default(); + let quorum_peers: Vec = (1..=7).map(peer_id_from_byte).collect(); + let targets = single_key_targets(&key, quorum_peers.clone(), vec![]); + + let evidence = build_evidence( + (0..4) + .map(|i| (quorum_peers[i], PresenceEvidence::Present)) + .chain((4..7).map(|i| (quorum_peers[i], PresenceEvidence::Absent))) + .collect(), + vec![], + ); + + let credit = |_: &PeerId, _: &XorName| -> bool { true }; + let outcome = + evaluate_key_evidence_with_holder_check(&key, &evidence, &targets, &config, credit); + assert!( + matches!(outcome, 
KeyVerificationOutcome::QuorumVerified { .. }), + "all-credited Present should pass quorum, got {outcome:?}" + ); + } + + #[test] + fn paid_list_path_unaffected_by_holder_credit() { + // v12 §6: holder-credit gates Present claims, NOT paid-list + // evidence (the paid-list lookup is the receiving peer's own + // data, not a claim about K). A peer with no credit at all + // can still contribute to paid-list majority. + let key = xor_name_from_byte(0x35); + let config = ReplicationConfig::default(); + let quorum_peers: Vec = (1..=3).map(peer_id_from_byte).collect(); + let paid_peers: Vec = (10..=14).map(peer_id_from_byte).collect(); + let targets = single_key_targets(&key, quorum_peers.clone(), paid_peers.clone()); + + let evidence = build_evidence( + quorum_peers + .iter() + .map(|p| (*p, PresenceEvidence::Absent)) + .collect(), + vec![ + (paid_peers[0], PaidListEvidence::Confirmed), + (paid_peers[1], PaidListEvidence::Confirmed), + (paid_peers[2], PaidListEvidence::Confirmed), + (paid_peers[3], PaidListEvidence::NotFound), + (paid_peers[4], PaidListEvidence::NotFound), + ], + ); + + let credit = |_: &PeerId, _: &XorName| -> bool { false }; + let outcome = + evaluate_key_evidence_with_holder_check(&key, &evidence, &targets, &config, credit); + assert!( + matches!(outcome, KeyVerificationOutcome::PaidListVerified { .. }), + "paid-list path must not be gated by holder-credit, got {outcome:?}" + ); + } + // ----------------------------------------------------------------------- // evaluate_key_evidence: PaidListVerified // ----------------------------------------------------------------------- diff --git a/src/replication/recent_provers.rs b/src/replication/recent_provers.rs new file mode 100644 index 00000000..b793c228 --- /dev/null +++ b/src/replication/recent_provers.rs @@ -0,0 +1,355 @@ +//! Holder-eligibility cache: which peers recently proved storage of +//! which key, against which commitment. +//! +//! 
Phase 2d of the v12 storage-bound audit design +//! (`notes/security-findings-2026-05-22/proposal-gossip-audit-v12.md`). +//! +//! When the auditor successfully verifies a commitment-bound audit for +//! peer P on key K (against P's currently-credited commitment hash H), +//! it inserts `(P, H, now)` into `recent_provers[K]`. Reward / quorum +//! eligibility for P-as-holder-of-K then checks that this cache entry +//! still matches P's *currently credited* commitment hash; if P rotates +//! the hash via fresh gossip, the cache entry becomes stale and credit +//! is denied until the next successful audit against the new hash. +//! +//! Invariants enforced here: +//! +//! - **Per-key cap**: at most [`MAX_PROVERS_PER_KEY`] entries per key, +//! LRU-evicted by `proved_at`. Bounds the per-key working set so a +//! well-replicated key cannot fill memory. +//! - **RT-only**: only peers in the caller's routing table populate +//! entries — the caller is responsible for filtering before +//! [`RecentProvers::record_proof`]; this module just stores what it's +//! told. +//! - **Hash-bound credit**: [`RecentProvers::is_credited_holder`] +//! requires the cache entry's `commitment_hash` to match the peer's +//! *current* `commitment_hash`. A peer who proves K under C1 then +//! rotates to C2 loses credit until re-proving K under C2. +//! +//! - **TTL**: entries older than [`PROVER_ENTRY_TTL`] are ignored by +//! [`RecentProvers::is_credited_holder`] on read, and +//! [`RecentProvers::sweep_expired`] reclaims their memory when a +//! caller invokes it (e.g. periodically from the engine). +//! - **`PeerRemoved` cleanup**: the caller should call +//! [`RecentProvers::forget_peer`] when a peer leaves the routing +//! table to drop their entries immediately (faster than waiting for +//! TTL). 
+ +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use saorsa_core::identity::PeerId; + +use crate::ant_protocol::XorName; + +/// Maximum number of cached provers per key. +/// +/// Sized at 2× `CLOSE_GROUP_SIZE = 8`, giving 8 slack slots for churn +/// without unbounded growth. LRU-evicted within the cap. +pub const MAX_PROVERS_PER_KEY: usize = 16; + +/// Maximum age of a cached prover entry before it is considered stale. +/// +/// A proof older than this is treated as "no credit" by +/// [`RecentProvers::is_credited_holder`] even if the commitment hash +/// still matches. +/// +/// v10/v12 §6 spec: `RECENT_PROOF_TTL = 2 × max audit interval` (≈40 min +/// at the default 20 min max). Setting too low → peers fall out of +/// credit between audits. Setting too high → lazy node has more leeway +/// before re-audit is required. 40 min comfortably covers one audit +/// cycle on the average peer while still requiring re-proof inside the +/// rotation window. +pub const PROVER_ENTRY_TTL: Duration = Duration::from_secs(40 * 60); + +/// One cached prover entry: who proved the key, when, and against which +/// commitment. +#[derive(Debug, Clone, Copy)] +pub struct ProverEntry { + /// The peer that produced the audit proof. + pub peer_id: PeerId, + /// When the proof was recorded. Used for LRU eviction. + pub proved_at: Instant, + /// The peer's commitment hash at proof time. Holder-eligibility + /// requires this to match the peer's *currently credited* hash. + pub commitment_hash: [u8; 32], +} + +/// Per-key cache of recent provers, capped at [`MAX_PROVERS_PER_KEY`]. +#[derive(Debug, Default, Clone)] +pub struct RecentProvers { + /// `entries[K]` is the per-key bounded list. Entries are kept sorted + /// by `proved_at` ascending so eviction is `O(1)` (drop head). + entries: HashMap>, +} + +impl RecentProvers { + /// Empty cache. 
+ #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Record that `peer_id` proved storage of `key` under commitment + /// `commitment_hash` at `proved_at`. + /// + /// If the same `(peer_id, commitment_hash)` is already cached for + /// this key, the entry is updated in place (refreshes `proved_at`). + /// Otherwise a new entry is appended, evicting the oldest entry if + /// the per-key cap would be exceeded. + pub fn record_proof( + &mut self, + key: XorName, + peer_id: PeerId, + commitment_hash: [u8; 32], + proved_at: Instant, + ) { + let bucket = self.entries.entry(key).or_default(); + + // Refresh-in-place if the (peer, hash) already exists. + for e in bucket.iter_mut() { + if e.peer_id == peer_id && e.commitment_hash == commitment_hash { + e.proved_at = proved_at; + bucket.sort_by_key(|e| e.proved_at); + return; + } + } + + // Evict the oldest entry if we're at the cap. + if bucket.len() >= MAX_PROVERS_PER_KEY { + // bucket is sorted ascending; oldest is index 0. + bucket.remove(0); + } + + bucket.push(ProverEntry { + peer_id, + proved_at, + commitment_hash, + }); + bucket.sort_by_key(|e| e.proved_at); + } + + /// Is `peer_id` currently credited as a holder of `key`? + /// + /// Returns `true` iff there is a non-stale cached entry with `peer_id` + /// and `commitment_hash == current_commitment_hash`. + /// + /// "Non-stale" means `now - proved_at < PROVER_ENTRY_TTL`. The hash + /// binding is the v12 §6 lever: a peer that rotates their commitment + /// must re-prove every key they want credit for. The TTL is a + /// secondary safety net that revokes credit even if the hash + /// happens to match (e.g. a peer who proved long ago but has been + /// silent or offline since). 
+ #[must_use] + pub fn is_credited_holder( + &self, + key: &XorName, + peer_id: &PeerId, + current_commitment_hash: &[u8; 32], + ) -> bool { + let now = Instant::now(); + self.entries.get(key).is_some_and(|bucket| { + bucket.iter().any(|e| { + &e.peer_id == peer_id + && &e.commitment_hash == current_commitment_hash + && now.saturating_duration_since(e.proved_at) < PROVER_ENTRY_TTL + }) + }) + } + + /// Sweep entries older than [`PROVER_ENTRY_TTL`] across all keys. + /// + /// Returns the number of entries dropped. Intended for periodic + /// invocation by a background task; `is_credited_holder` already + /// honours the TTL on read, so the sweep only reclaims memory. + pub fn sweep_expired(&mut self, now: Instant) -> usize { + let mut dropped = 0; + for bucket in self.entries.values_mut() { + let before = bucket.len(); + bucket.retain(|e| now.saturating_duration_since(e.proved_at) < PROVER_ENTRY_TTL); + dropped += before - bucket.len(); + } + self.entries.retain(|_, b| !b.is_empty()); + dropped + } + + /// Drop every cached entry for `peer_id` across all keys. + /// + /// Called when a peer leaves the routing table (RT-only invariant) + /// or on explicit eviction. + pub fn forget_peer(&mut self, peer_id: &PeerId) { + for bucket in self.entries.values_mut() { + bucket.retain(|e| &e.peer_id != peer_id); + } + self.entries.retain(|_, b| !b.is_empty()); + } + + /// Drop every entry whose `commitment_hash` matches `stale_hash` + /// (used when the auditor invalidates a peer's `last_commitment` — + /// e.g. on `UnknownCommitmentHash` rejection — to remove the cached + /// proofs against that no-longer-valid commitment). + pub fn forget_commitment(&mut self, stale_hash: &[u8; 32]) { + for bucket in self.entries.values_mut() { + bucket.retain(|e| &e.commitment_hash != stale_hash); + } + self.entries.retain(|_, b| !b.is_empty()); + } + + /// Number of cached entries for `key`. Test/observability helper. 
+ #[must_use] + pub fn provers_for(&self, key: &XorName) -> usize { + self.entries.get(key).map_or(0, Vec::len) + } + + /// Total number of cached entries across all keys. + #[must_use] + pub fn total_entries(&self) -> usize { + self.entries.values().map(Vec::len).sum() + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used)] +mod tests { + use super::*; + use std::time::Duration; + + fn peer(byte: u8) -> PeerId { + let mut bytes = [0u8; 32]; + bytes[0] = byte; + PeerId::from_bytes(bytes) + } + + fn key(byte: u8) -> XorName { + let mut k = [0u8; 32]; + k[0] = byte; + k + } + + fn hash(byte: u8) -> [u8; 32] { + [byte; 32] + } + + #[test] + fn empty_cache_credits_no_one() { + let cache = RecentProvers::new(); + assert!(!cache.is_credited_holder(&key(1), &peer(1), &hash(1))); + assert_eq!(cache.total_entries(), 0); + } + + #[test] + fn recorded_proof_credits_under_same_hash() { + let mut cache = RecentProvers::new(); + cache.record_proof(key(1), peer(7), hash(0xAB), Instant::now()); + assert!(cache.is_credited_holder(&key(1), &peer(7), &hash(0xAB))); + } + + #[test] + fn rotated_hash_loses_credit() { + // Core v12 §6 attack-bound property: a peer who proves K under + // C1 must re-prove under C2 to keep credit. The cache entry's + // hash binding enforces this. + let mut cache = RecentProvers::new(); + cache.record_proof(key(1), peer(7), hash(0xAB), Instant::now()); + // Same peer, same key, but the auditor's "current" hash for + // this peer is now different (peer gossiped a new commitment). 
+ assert!(!cache.is_credited_holder(&key(1), &peer(7), &hash(0xCD))); + } + + #[test] + fn other_peer_under_same_hash_not_credited() { + let mut cache = RecentProvers::new(); + cache.record_proof(key(1), peer(7), hash(0xAB), Instant::now()); + assert!(!cache.is_credited_holder(&key(1), &peer(8), &hash(0xAB))); + } + + #[test] + fn per_key_cap_evicts_oldest() { + let mut cache = RecentProvers::new(); + let now = Instant::now(); + // MAX_PROVERS_PER_KEY is a small usize (16). Narrow to u8 once + // so the test loop can hand the peer-id byte directly to + // `peer(...)` without per-iteration casts. + let max_u8 = u8::try_from(MAX_PROVERS_PER_KEY).unwrap_or(u8::MAX); + // Fill the bucket with MAX_PROVERS_PER_KEY + 1 distinct peers. + for i in 0..=max_u8 { + let t = now + Duration::from_millis(u64::from(i)); + cache.record_proof(key(1), peer(i), hash(0xAB), t); + } + assert_eq!(cache.provers_for(&key(1)), MAX_PROVERS_PER_KEY); + // The oldest (peer 0) should be evicted; peer MAX should be present. + assert!(!cache.is_credited_holder(&key(1), &peer(0), &hash(0xAB))); + assert!(cache.is_credited_holder(&key(1), &peer(max_u8), &hash(0xAB))); + } + + #[test] + fn refresh_in_place_does_not_grow_bucket() { + let mut cache = RecentProvers::new(); + let now = Instant::now(); + // Same (peer, hash) repeated three times. Bucket should stay at 1. 
+ cache.record_proof(key(1), peer(1), hash(0xAB), now); + cache.record_proof(key(1), peer(1), hash(0xAB), now + Duration::from_secs(1)); + cache.record_proof(key(1), peer(1), hash(0xAB), now + Duration::from_secs(2)); + assert_eq!(cache.provers_for(&key(1)), 1); + } + + #[test] + fn forget_peer_drops_all_entries() { + let mut cache = RecentProvers::new(); + let now = Instant::now(); + cache.record_proof(key(1), peer(1), hash(0xAB), now); + cache.record_proof(key(2), peer(1), hash(0xAB), now); + cache.record_proof(key(1), peer(2), hash(0xAB), now); + assert_eq!(cache.total_entries(), 3); + + cache.forget_peer(&peer(1)); + assert_eq!(cache.total_entries(), 1); + assert!(!cache.is_credited_holder(&key(1), &peer(1), &hash(0xAB))); + assert!(cache.is_credited_holder(&key(1), &peer(2), &hash(0xAB))); + } + + #[test] + fn forget_commitment_drops_only_matching_entries() { + let mut cache = RecentProvers::new(); + let now = Instant::now(); + cache.record_proof(key(1), peer(1), hash(0xAB), now); + cache.record_proof(key(1), peer(1), hash(0xCD), now); + cache.record_proof(key(2), peer(2), hash(0xAB), now); + assert_eq!(cache.total_entries(), 3); + + cache.forget_commitment(&hash(0xAB)); + assert_eq!(cache.total_entries(), 1); + // Only the (peer(1), hash 0xCD) entry remains. + assert!(cache.is_credited_holder(&key(1), &peer(1), &hash(0xCD))); + assert!(!cache.is_credited_holder(&key(1), &peer(1), &hash(0xAB))); + assert!(!cache.is_credited_holder(&key(2), &peer(2), &hash(0xAB))); + } + + #[test] + fn lazy_rotation_via_unknown_commitment_hash_drops_credit() { + // Scenario from v12 §5 (revised UnknownCommitmentHash handler): + // 1. Peer P proves K under C1 → cached. + // 2. Auditor pinned to C1 sends a new challenge. + // 3. P replies UnknownCommitmentHash (they rotated and + // dropped the bytes). + // 4. Auditor invalidates last_commitment[P] AND calls + // forget_commitment(C1) so credit doesn't linger. 
+ // + // Property checked: after forget_commitment(C1), P is no longer + // credited as holder of K under C1. + let mut cache = RecentProvers::new(); + cache.record_proof(key(1), peer(7), hash(0xAB), Instant::now()); + assert!(cache.is_credited_holder(&key(1), &peer(7), &hash(0xAB))); + + // Auditor detects rotation/dodge, invalidates the C1 hash. + cache.forget_commitment(&hash(0xAB)); + + assert!(!cache.is_credited_holder(&key(1), &peer(7), &hash(0xAB))); + // And under any new hash too — the peer has to re-prove. + assert!(!cache.is_credited_holder(&key(1), &peer(7), &hash(0xCD))); + } +} diff --git a/src/replication/storage_commitment_audit.rs b/src/replication/storage_commitment_audit.rs new file mode 100644 index 00000000..9cb1b577 --- /dev/null +++ b/src/replication/storage_commitment_audit.rs @@ -0,0 +1,1221 @@ +//! Gossip-triggered contiguous-subtree storage audit (ADR-0002). +//! +//! A node commits to what it stores (a signed Merkle [`StorageCommitment`] +//! gossiped to neighbours). On receiving a peer's changed commitment, a +//! neighbour may audit it: pin the just-gossiped root, send a fresh nonce that +//! deterministically selects one contiguous subtree, and require the peer to +//! prove that subtree (structure + real bytes) within a deadline. This module +//! owns the auditor entry point [`run_subtree_audit`] and the responder handler +//! [`handle_subtree_challenge`]; the pure proof maths live in +//! [`crate::replication::subtree`]. 
+ +use std::sync::Arc; + +use crate::logging::{debug, info, warn}; +use rand::Rng; + +use crate::ant_protocol::XorName; +use crate::replication::commitment::{commitment_hash, StorageCommitment}; +use crate::replication::commitment_state::ResponderCommitmentState; +use crate::replication::config::{ + ReplicationConfig, MAX_BYTE_CHALLENGE_KEYS, REPLICATION_PROTOCOL_ID, +}; +use crate::replication::protocol::{ + ReplicationMessage, ReplicationMessageBody, SubtreeAuditChallenge, SubtreeAuditResponse, + SubtreeByteChallenge, SubtreeByteItem, SubtreeByteResponse, +}; +use crate::replication::recent_provers::RecentProvers; +use crate::replication::subtree::{ + select_spotcheck_indices, select_subtree_path, subtree_plan, verify_subtree_proof, + StructureVerdict, SubtreeProof, +}; +use crate::replication::types::{AuditFailureReason, AuditFailureSummary, FailureEvidence}; +use crate::storage::LmdbStorage; +use saorsa_core::identity::PeerId; +use saorsa_core::P2PNode; +use tokio::sync::RwLock; + +// The gossip-triggered auditor shares the engine's [`AuditTickResult`] outcome +// type with the responsible-chunk audit (defined in [`super::audit`]), so the +// engine can dispatch both audits' results through one match. +use crate::replication::audit::AuditTickResult; + +// --------------------------------------------------------------------------- +// Auditor side +// --------------------------------------------------------------------------- + +/// ADR-0002 round-2 byte challenge samples a SMALL surprise set of the proven +/// leaves (3..=5). Small enough that the responder's honest local-disk read of +/// the original chunks stays well inside the possession-in-time deadline, while +/// a relay forced to fetch them over the network blows it; large enough that +/// faking a fraction `x` of leaves survives only `(1 - x)^k`. +const BYTE_SPOTCHECK_MIN: u32 = 3; +const BYTE_SPOTCHECK_MAX: u32 = 5; + +/// Holder-eligibility cache the auditor credits on a passing audit. 
+/// +/// Owned by [`crate::replication::ReplicationEngine`]; borrowed here so a +/// passing audit can record `(peer, commitment_hash)` as a proven holder for +/// downstream quorum / paid-list credit. +pub struct AuditCredit<'a> { + /// Holder-eligibility cache. + pub recent_provers: &'a Arc>, +} + +/// The cross-cutting context for verifying one audit response, bundled so the +/// response-dispatch and verification functions stay readable. +struct AuditCtx<'a> { + p2p_node: &'a Arc, + challenged_peer: &'a PeerId, + challenge_id: u64, + nonce: [u8; 32], + expected_commitment_hash: [u8; 32], + config: &'a ReplicationConfig, + credit: Option<&'a AuditCredit<'a>>, +} + +/// Run one gossip-triggered subtree audit against `challenged_peer`, pinned to +/// the commitment hash the peer just gossiped (`expected_commitment_hash`). +/// +/// ADR-0002 two-round audit. The auditor sends a fresh random nonce and runs: +/// +/// 1. **Structure** (round 1) — the returned subtree rebuilds to the pinned +/// root, within a size-scaled deadline. +/// 2. **Real bytes** (round 2) — the auditor demands the ORIGINAL chunk content +/// for a 3..=5 nonce-selected sample of the proven leaves FROM the responder, +/// and recomputes both the content-address hash and the nonce freshness hash +/// from that served content. The auditor holds none of the peer's chunks. +/// 3. **Timing** — each round's deadline is sized to an honest local-disk read, +/// so a relay forced to fetch over the network blows it. +/// +/// A timeout (either round) is reported as [`AuditFailureReason::Timeout`] (the +/// caller applies the strike/grace policy). Any structural failure, served +/// content that fails a hash, an explicit `Absent` for a committed sampled key, +/// or a rejection of a recently gossiped commitment, is a confirmed failure +/// acted on immediately. On a full pass, records the peer as a proven holder. 
+pub async fn run_subtree_audit( + p2p_node: &Arc, + config: &ReplicationConfig, + challenged_peer: &PeerId, + expected_commitment_hash: [u8; 32], + key_count: u32, + credit: Option<&AuditCredit<'_>>, +) -> AuditTickResult { + let (nonce, challenge_id) = { + let mut rng = rand::thread_rng(); + (rng.gen::<[u8; 32]>(), rng.gen::()) + }; + + let challenge = SubtreeAuditChallenge { + challenge_id, + nonce, + challenged_peer_id: *challenged_peer.as_bytes(), + expected_commitment_hash, + }; + let msg = ReplicationMessage { + request_id: challenge_id, + body: ReplicationMessageBody::SubtreeAuditChallenge(challenge), + }; + let encoded = match msg.encode() { + Ok(data) => data, + Err(e) => { + warn!("Audit: failed to encode subtree challenge for {challenged_peer}: {e}"); + return AuditTickResult::Idle; + } + }; + + // Size the proof deadline from the ACTUAL selected subtree (its real-leaf + // count for this nonce + key_count), not a fixed worst-case hint. This keeps + // the deadline tight to "responder hashes ~sqrt(N) chunks at local-disk + // speed", so a relay that must fetch the subtree over the network blows it. + // The auditor and responder derive the same selection, so we know the leaf + // count before the response arrives. 
+ let subtree_leaves = select_subtree_path(&nonce, key_count).map_or_else( + || config.subtree_audit_timeout_leaf_hint(), + |p| p.real_leaf_count() as usize, + ); + let timeout = config.audit_response_timeout(subtree_leaves); + + let response = match p2p_node + .send_request(challenged_peer, REPLICATION_PROTOCOL_ID, encoded, timeout) + .await + { + Ok(resp) => resp, + Err(e) => { + debug!("Audit: subtree challenge to {challenged_peer} timed out / failed: {e}"); + return failed(challenged_peer, challenge_id, AuditFailureReason::Timeout); + } + }; + + let resp_msg = match ReplicationMessage::decode(&response.data) { + Ok(m) => m, + Err(e) => { + warn!("Audit: failed to decode subtree response from {challenged_peer}: {e}"); + return failed( + challenged_peer, + challenge_id, + AuditFailureReason::MalformedResponse, + ); + } + }; + + let ctx = AuditCtx { + p2p_node, + challenged_peer, + challenge_id, + nonce, + expected_commitment_hash, + config, + credit, + }; + dispatch_subtree_response(resp_msg.body, &ctx).await +} + +/// Outcome of the round-2 byte challenge round-trip (auditor side). +enum ByteRound { + /// The responder returned per-key items (verified by the caller). + Served(Vec), + /// The responder rejected the byte challenge (confirmed failure for a + /// recently pinned commitment). + Rejected, + /// No response within the byte deadline, or a transport error (graced + /// timeout). + Timeout, + /// Malformed / unexpected round-2 response body. + Malformed, +} + +/// Round 2: ask the responder for the ORIGINAL chunk content of one BATCH of +/// auditor-selected spot-check `keys` (at most [`MAX_BYTE_CHALLENGE_KEYS`], so +/// the worst-case response of max-size chunks fits the wire cap), sized to a +/// possession-in-time deadline (honest local-disk read of `keys.len()` chunks). +/// The responder cannot have predicted which keys are sampled. 
+async fn request_byte_proof(ctx: &AuditCtx<'_>, keys: &[XorName]) -> ByteRound { + let challenge = SubtreeByteChallenge { + challenge_id: ctx.challenge_id, + nonce: ctx.nonce, + challenged_peer_id: *ctx.challenged_peer.as_bytes(), + expected_commitment_hash: ctx.expected_commitment_hash, + keys: keys.to_vec(), + }; + let msg = ReplicationMessage { + request_id: ctx.challenge_id, + body: ReplicationMessageBody::SubtreeByteChallenge(challenge), + }; + let encoded = match msg.encode() { + Ok(data) => data, + Err(e) => { + warn!("Audit: failed to encode byte challenge: {e}"); + return ByteRound::Malformed; + } + }; + + // Deadline sized to "honest responder reads `keys.len()` local chunks": a + // relay forced to fetch them over the network blows it (graced timeout, + // never a confirmed failure — same possession-in-time principle as round 1). + let timeout = ctx.config.audit_response_timeout(keys.len()); + let response = match ctx + .p2p_node + .send_request( + ctx.challenged_peer, + REPLICATION_PROTOCOL_ID, + encoded, + timeout, + ) + .await + { + Ok(resp) => resp, + Err(e) => { + debug!( + "Audit: byte challenge to {} timed out / failed: {e}", + ctx.challenged_peer + ); + return ByteRound::Timeout; + } + }; + + let resp_msg = match ReplicationMessage::decode(&response.data) { + Ok(m) => m, + Err(e) => { + warn!("Audit: failed to decode byte response: {e}"); + return ByteRound::Malformed; + } + }; + + match resp_msg.body { + ReplicationMessageBody::SubtreeByteResponse(SubtreeByteResponse::Items { + challenge_id, + items, + }) if challenge_id == ctx.challenge_id => ByteRound::Served(items), + ReplicationMessageBody::SubtreeByteResponse(SubtreeByteResponse::Rejected { + challenge_id, + reason, + }) if challenge_id == ctx.challenge_id => { + warn!( + "Audit: {} rejected byte challenge: {reason}", + ctx.challenged_peer + ); + ByteRound::Rejected + } + // A node claiming bootstrap MID-AUDIT (it answered round 1) is treated + // as a timeout: it didn't prove possession 
but the round-1 proof shows + // it isn't bootstrapping, so the bootstrap-claim-abuse detector (round 1) + // owns that lane; here we just don't credit it. + ReplicationMessageBody::SubtreeByteResponse(SubtreeByteResponse::Bootstrapping { + challenge_id, + }) if challenge_id == ctx.challenge_id => ByteRound::Timeout, + _ => ByteRound::Malformed, + } +} + +/// Map a decoded response body to an audit outcome (auditor side). A response +/// whose `challenge_id` doesn't match, or any non-subtree body, is malformed. +async fn dispatch_subtree_response( + body: ReplicationMessageBody, + ctx: &AuditCtx<'_>, +) -> AuditTickResult { + let challenged_peer = ctx.challenged_peer; + let challenge_id = ctx.challenge_id; + let malformed = || { + failed( + challenged_peer, + challenge_id, + AuditFailureReason::MalformedResponse, + ) + }; + match body { + ReplicationMessageBody::SubtreeAuditResponse(SubtreeAuditResponse::Bootstrapping { + challenge_id: resp_id, + }) => { + if resp_id != challenge_id { + return malformed(); + } + AuditTickResult::BootstrapClaim { + peer: *challenged_peer, + } + } + ReplicationMessageBody::SubtreeAuditResponse(SubtreeAuditResponse::Rejected { + challenge_id: resp_id, + reason, + }) => { + if resp_id != challenge_id { + return malformed(); + } + // ADR-0002: the auditor only ever pins a commitment the peer JUST + // gossiped, and an honest responder retains its last two gossiped + // commitments. So a rejection of a freshly pinned root is a + // confirmed failure (repudiating what you just published), not + // benign staleness. There is no no-penalty lane. 
+ warn!("Audit: peer {challenged_peer} rejected subtree challenge: {reason}"); + failed(challenged_peer, challenge_id, AuditFailureReason::Rejected) + } + ReplicationMessageBody::SubtreeAuditResponse(SubtreeAuditResponse::Proof { + challenge_id: resp_id, + commitment, + proof, + }) => { + if resp_id != challenge_id { + return malformed(); + } + verify_subtree_response(ctx, &commitment, &proof).await + } + _ => { + warn!("Audit: unexpected response type from {challenged_peer}"); + malformed() + } + } +} + +/// The pure verdict of evaluating a subtree-audit response, independent of +/// storage/network. Tests call this directly so the SHIPPED gate logic is what +/// gets exercised (no reimplementation that could drift). +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum AuditVerdict { + /// All gates passed and at least one leaf was byte-verified. + Pass { + /// Number of leaves whose real bytes were verified in round 2. + checked: usize, + }, + /// A confirmed failure with this reason (penalizable / acted upon). + Fail(AuditFailureReason), +} + +/// Round-1 structural evaluation of a subtree-audit proof (ADR-0002). +/// +/// Runs the cheap gates in fail-fast order: pin / identity / signature → +/// structure (the returned subtree rebuilds to the pinned root). It does **not** +/// prove byte possession — the leaves carry only the public `bytes_hash` (the +/// chunk address) and a `nonced_hash` the responder computed itself. Possession +/// is proven in round 2 ([`verify_byte_response`]), where the auditor demands +/// the original chunk bytes for a nonce-selected sample and recomputes both +/// hashes from the SERVED content. This removes any dependency on the auditor +/// holding the peer's chunks. +/// +/// Returns [`StructureVerdict::Valid`] (proceed to round 2) or a confirmed +/// [`AuditFailureReason`] mapped from the failing gate. 
+pub(crate) fn evaluate_subtree_structure( + commitment: &StorageCommitment, + proof: &SubtreeProof, + nonce: &[u8; 32], + expected_commitment_hash: &[u8; 32], + challenged_peer_bytes: &[u8; 32], +) -> Result<(), AuditFailureReason> { + // -- Pin + identity + signature -- + if &commitment.sender_peer_id != challenged_peer_bytes { + return Err(AuditFailureReason::Rejected); + } + let derived_peer_id = *blake3::hash(&commitment.sender_public_key).as_bytes(); + if derived_peer_id != commitment.sender_peer_id { + return Err(AuditFailureReason::Rejected); + } + match commitment_hash(commitment) { + Some(h) if &h == expected_commitment_hash => {} + _ => return Err(AuditFailureReason::Rejected), + } + if !crate::replication::commitment::verify_commitment_signature(commitment) { + return Err(AuditFailureReason::Rejected); + } + + // -- Structure -- + if let StructureVerdict::Invalid(_) = verify_subtree_proof(proof, nonce, commitment) { + return Err(AuditFailureReason::DigestMismatch); + } + Ok(()) +} + +/// The auditor's nonce-derived spot-check sample of the round-1 subtree: the +/// distinct leaves (in proof order) whose original bytes the auditor will demand +/// in round 2. Empty only if the proof is empty (cannot happen post-structure). +pub(crate) fn spotcheck_leaves<'a>( + proof: &'a SubtreeProof, + nonce: &[u8; 32], + key_count: u32, + spotcheck_count: u32, +) -> Vec<&'a crate::replication::subtree::SubtreeLeaf> { + let Some(path) = select_subtree_path(nonce, key_count) else { + return Vec::new(); + }; + let mut out = Vec::new(); + for idx in select_spotcheck_indices(nonce, &path, spotcheck_count) { + if let Some(leaf) = proof.leaves.get(idx as usize) { + out.push(leaf); + } + } + out +} + +/// Round-2 verdict (ADR-0002): the responder served the original chunk content +/// for the auditor's spot-check sample; verify possession from THAT content. 
+/// +/// `served(key)` returns what the responder returned for a requested key: +/// `Some(Some(bytes))` for [`SubtreeByteItem::Present`], `Some(None)` for an +/// explicit [`SubtreeByteItem::Absent`], and `None` if the responder omitted the +/// key entirely (treated like `Absent` — a committed key it would not serve). +/// +/// For each sampled leaf the auditor recomputes, from the SERVED content: +/// - `BLAKE3(content) == leaf.bytes_hash` (the chunk's content address), AND +/// - `BLAKE3(nonce ‖ peer ‖ key ‖ content) == leaf.nonced_hash` (freshness), +/// i.e. `compute_audit_digest(nonce, peer, key, content)`. +/// +/// The freshness inputs are byte-identical to what the responder used to BUILD +/// the leaf in round 1 (`subtree_leaf` → `nonced_leaf_hash`): the SAME four +/// inputs, so an honest holder's served content reproduces `nonced_hash` +/// exactly. Round 1 commits over the data (the `nonced_hash` is uncomputable +/// without the bytes); round 2 reveals a random subset to prove the commitment +/// was not fabricated. +/// +/// Both checks are over the content the responder sent, so the auditor needs to +/// hold none of the peer's chunks. Any `Absent`/omitted committed key, or any +/// served content that fails a hash, is a provable lie → confirmed +/// [`AuditFailureReason::DigestMismatch`]. All sampled leaves verifying → +/// `Pass { checked }`. +pub(crate) fn verify_byte_response( + leaves: &[&crate::replication::subtree::SubtreeLeaf], + nonce: &[u8; 32], + challenged_peer_bytes: &[u8; 32], + served: impl Fn(&XorName) -> Option>>, +) -> AuditVerdict { + let mut checked = 0usize; + for leaf in leaves { + // Present{bytes} -> Some(Some(bytes)); Absent -> Some(None); omitted -> None. + // A committed key the responder cannot / will not serve is a provable lie. 
+ let Some(Some(content)) = served(&leaf.key) else { + return AuditVerdict::Fail(AuditFailureReason::DigestMismatch); + }; + let plain = *blake3::hash(&content).as_bytes(); + let nonced = crate::replication::subtree::nonced_leaf_hash( + nonce, + challenged_peer_bytes, + &leaf.key, + &content, + ); + if leaf.bytes_hash != plain || leaf.nonced_hash != nonced { + // Served content does not hash to the committed address / freshness + // hash: cannot be the chunk it committed to. + return AuditVerdict::Fail(AuditFailureReason::DigestMismatch); + } + checked += 1; + } + AuditVerdict::Pass { checked } +} + +/// Verify a subtree-proof response (auditor side), ADR-0002 two-round audit. +/// +/// **Round 1** (this proof): pin + identity + signature + structure. If the +/// proof structurally rebuilds to the pinned root, the tree SHAPE is committed — +/// but not yet that the bytes are held. **Round 2**: the auditor picks a small +/// nonce-selected sample of the just-proven leaves and sends a +/// [`SubtreeByteChallenge`] demanding their original chunk content FROM the +/// responder, then verifies that content against the committed `bytes_hash` +/// (content address) and `nonced_hash` (freshness). A responder that committed +/// to a chunk it no longer holds cannot serve content that hashes to the +/// committed address, so it fails — regardless of what the auditor holds. On a +/// full pass, credits the peer as a proven holder. +async fn verify_subtree_response( + ctx: &AuditCtx<'_>, + commitment: &StorageCommitment, + proof: &SubtreeProof, +) -> AuditTickResult { + let challenged_peer = ctx.challenged_peer; + let challenge_id = ctx.challenge_id; + + // -- Round 1: pin/identity/signature + structure (no bytes). 
-- + if let Err(reason) = evaluate_subtree_structure( + commitment, + proof, + &ctx.nonce, + &ctx.expected_commitment_hash, + challenged_peer.as_bytes(), + ) { + warn!("Audit: {challenged_peer} failed subtree structure ({reason:?})"); + return failed(challenged_peer, challenge_id, reason); + } + + // -- Round 2: surprise byte challenge for a 3..=5 nonce-selected sample. -- + // The responder cannot predict which leaves are sampled, and must serve the + // ORIGINAL content for each. We cap the sample at the ADR's 3..=5 band + // (clamped to the subtree size) so the round-2 message and the responder's + // disk read stay cheap. + let sample_n = ctx + .config + .audit_spotcheck_count() + .clamp(BYTE_SPOTCHECK_MIN, BYTE_SPOTCHECK_MAX); + let sampled = spotcheck_leaves(proof, &ctx.nonce, commitment.key_count, sample_n); + if sampled.is_empty() { + // Cannot happen after a valid structure (subtree is never empty), but + // guard rather than credit an unproven peer. + warn!("Audit: {challenged_peer} produced an empty spot-check sample; rejecting"); + return failed( + challenged_peer, + challenge_id, + AuditFailureReason::DigestMismatch, + ); + } + let sampled_keys: Vec = sampled.iter().map(|l| l.key).collect(); + + // The sample is challenged in batches of MAX_BYTE_CHALLENGE_KEYS so each + // response — worst case, every requested chunk at MAX_CHUNK_SIZE — still + // encodes under MAX_REPLICATION_MESSAGE_SIZE. Each batch carries its own + // possession-in-time deadline (sized to its own length), so splitting does + // not widen the per-chunk window a relay would need to fetch over the + // network. The full sample is then verified as one set. 
+ let mut served = Vec::with_capacity(sampled_keys.len()); + let mut round_failure = None; + for batch in sampled_keys.chunks(MAX_BYTE_CHALLENGE_KEYS) { + match request_byte_proof(ctx, batch).await { + ByteRound::Served(items) => served.extend(items), + // The responder rejected the byte challenge for a recently pinned + // commitment → confirmed failure, same as a round-1 rejection. + ByteRound::Rejected => { + round_failure = Some(AuditFailureReason::Rejected); + break; + } + // No response within the byte deadline (or transport error) → + // timeout (graced by the caller's strike policy — could be honest + // slowness). + ByteRound::Timeout => { + round_failure = Some(AuditFailureReason::Timeout); + break; + } + // Malformed/unexpected round-2 body. + ByteRound::Malformed => { + round_failure = Some(AuditFailureReason::MalformedResponse); + break; + } + } + } + let verdict = round_failure.map_or_else( + || { + verify_byte_response(&sampled, &ctx.nonce, challenged_peer.as_bytes(), |key| { + served.iter().find_map(|it| match it { + SubtreeByteItem::Present { key: k, bytes } if k == key => { + Some(Some(bytes.clone())) + } + SubtreeByteItem::Absent { key: k } if k == key => Some(None), + _ => None, + }) + }) + }, + AuditVerdict::Fail, + ); + + match verdict { + AuditVerdict::Fail(reason) => { + warn!("Audit: {challenged_peer} failed subtree audit ({reason:?})"); + failed(challenged_peer, challenge_id, reason) + } + AuditVerdict::Pass { checked } => { + // Closeness (ADR-0002, soft/observe-only) — see observe_closeness. + observe_closeness(ctx.p2p_node, ctx.config, challenged_peer, proof).await; + // Credit the peer as a proven holder of its committed keys. 
+ if let (Some(credit), Some(pin)) = (ctx.credit, commitment_hash(commitment)) { + let now = std::time::Instant::now(); + let mut provers = credit.recent_provers.write().await; + for leaf in &proof.leaves { + provers.record_proof(leaf.key, *challenged_peer, pin, now); + } + } + info!( + "Audit: peer {challenged_peer} passed subtree audit ({} leaves, {checked} \ + byte-checked)", + proof.leaves.len() + ); + AuditTickResult::Passed { + challenged_peer: *challenged_peer, + keys_checked: checked, + } + } + } +} + +/// Soft, density-aware closeness observation (ADR-0002). Logs — never fails — +/// when a suspicious fraction of the proof's leaves are keys the auditor itself +/// is NOT responsible for (a proxy for "implausibly far from the peer"). +/// +/// Using the auditor's own `SelfInclusiveRT` responsibility as the yardstick +/// makes this density-aware for free: on a small/dense network the auditor is +/// close to nearly every key, so almost nothing reads as far and no honest peer +/// is ever flagged. Enforcement is intentionally deferred until a testnet +/// calibrates the density threshold. +async fn observe_closeness( + p2p_node: &Arc, + config: &ReplicationConfig, + challenged_peer: &PeerId, + proof: &SubtreeProof, +) { + let self_id = *p2p_node.peer_id(); + let mut far = 0usize; + for leaf in &proof.leaves { + if !crate::replication::admission::is_responsible( + &self_id, + &leaf.key, + p2p_node, + config.close_group_size, + ) + .await + { + far += 1; + } + } + // Only worth a line when MOST of the proof is far — that's the padding + // shape. A normal proof on a sparse network has some far keys; that's fine. + let total = proof.leaves.len(); + if total > 0 && far * 2 > total { + debug!( + "Audit: closeness signal — {far}/{total} of {challenged_peer}'s proven leaves are \ + keys this auditor is not close to (observe-only; possible padding, not penalized)" + ); + } +} + +/// Build a confirmed-failure result. 
The auditor pinned a commitment the peer +/// committed to itself, so there is no per-key responsibility to re-confirm: +/// the failure is about the peer's own committed tree. +/// +/// The subtree audit fails a peer as a whole (one challenge, one verdict) rather +/// than per-key, so the [`AuditFailureSummary`] is a single-failure rollup +/// mapped from `reason` — enough for the shared audit-failure diagnostics log +/// line (`absent_keys`/`digest_mismatch_keys`) without inventing per-key counts +/// this audit shape does not have. +fn failed( + challenged_peer: &PeerId, + challenge_id: u64, + reason: AuditFailureReason, +) -> AuditTickResult { + let summary = subtree_failure_summary(&reason); + AuditTickResult::Failed { + evidence: FailureEvidence::AuditFailure { + challenge_id, + challenged_peer: *challenged_peer, + confirmed_failed_keys: Vec::new(), + summary, + reason, + }, + } +} + +/// Map a subtree-audit `reason` to a single-failure [`AuditFailureSummary`]. +/// +/// A `Timeout` is not (yet) a confirmed failure (it is graced), so it rolls up +/// as zero confirmed failures; every other reason is one confirmed failure, +/// categorised where the category is meaningful (byte/nonce/root mismatch → +/// `digest_mismatch_keys`; explicit absent → `absent_keys`). 
+fn subtree_failure_summary(reason: &AuditFailureReason) -> AuditFailureSummary { + let mut summary = AuditFailureSummary { + challenged_keys: 1, + ..AuditFailureSummary::default() + }; + match reason { + AuditFailureReason::Timeout => {} + AuditFailureReason::DigestMismatch => { + summary.failed_keys = 1; + summary.digest_mismatch_keys = 1; + } + AuditFailureReason::KeyAbsent => { + summary.failed_keys = 1; + summary.absent_keys = 1; + } + AuditFailureReason::MalformedResponse | AuditFailureReason::Rejected => { + summary.failed_keys = 1; + } + } + summary +} + +// --------------------------------------------------------------------------- +// Responder side +// --------------------------------------------------------------------------- + +/// Handle an incoming subtree audit challenge (responder side). +/// +/// Validates the challenge targets this node, looks up the pinned commitment in +/// the retained (last-two-gossiped) set, and builds the subtree proof for the +/// nonce-selected branch. If this node is bootstrapping it says so; if it +/// genuinely does not retain the pinned commitment it rejects (which the +/// auditor treats as a confirmed failure for a recently gossiped root). 
pub async fn handle_subtree_challenge(
    challenge: &SubtreeAuditChallenge,
    storage: &LmdbStorage,
    self_peer_id: &PeerId,
    is_bootstrapping: bool,
    // NOTE(review): the type argument of `Arc` is missing in this copy of the
    // source (it looks stripped by extraction) — restore it from the real file.
    commitment_state: Option<&Arc>,
) -> SubtreeAuditResponse {
    // Bootstrapping is reported before any validation, so even a mis-targeted
    // challenge gets `Bootstrapping` rather than `Rejected` while it is set.
    if is_bootstrapping {
        return SubtreeAuditResponse::Bootstrapping {
            challenge_id: challenge.challenge_id,
        };
    }

    // Refuse challenges addressed to a different peer id.
    if challenge.challenged_peer_id != *self_peer_id.as_bytes() {
        warn!(
            "Subtree audit challenge targeted wrong peer: expected {}, got {}",
            hex::encode(self_peer_id.as_bytes()),
            hex::encode(challenge.challenged_peer_id),
        );
        return SubtreeAuditResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: "challenged_peer_id does not match this node".to_string(),
        };
    }

    // Without commitment state there is nothing to look the pin up in.
    let Some(state) = commitment_state else {
        return SubtreeAuditResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: "no commitment state".to_string(),
        };
    };

    // Look up the pinned commitment among the last-two-gossiped retained set.
    let Some(built) = state.lookup_by_hash(&challenge.expected_commitment_hash) else {
        return SubtreeAuditResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: "unknown commitment hash".to_string(),
        };
    };

    // Geometry first (no bytes touched): which leaves to prove + the sibling
    // cut-hashes from the committed tree.
    let plan = match subtree_plan(built.tree(), &challenge.nonce) {
        Ok(p) => p,
        Err(e) => {
            warn!("Subtree audit: failed to plan proof: {e:?}");
            return SubtreeAuditResponse::Rejected {
                challenge_id: challenge.challenge_id,
                reason: "could not build subtree proof".to_string(),
            };
        }
    };

    // Read chunk bytes one leaf at a time so peak memory is bounded regardless
    // of subtree size, hashing each into its plain + nonced leaf.
    let mut leaves = Vec::with_capacity(plan.leaf_keys.len());
    for key in &plan.leaf_keys {
        // A storage error (`Err`) and a missing record (`Ok(None)`) are handled
        // the same way: the whole challenge is rejected on the first such key.
        let Ok(Some(bytes)) = storage.get_raw(key).await else {
            // Key is in our committed tree but we cannot read its bytes — real
            // storage loss / deliberate non-response. For a recently gossiped
            // pin the auditor counts this rejection as a confirmed failure.
            warn!(
                "Subtree audit: missing bytes for committed key {}",
                hex::encode(key)
            );
            return SubtreeAuditResponse::Rejected {
                challenge_id: challenge.challenge_id,
                reason: format!("missing bytes for committed key: {}", hex::encode(key)),
            };
        };
        // The leaf is derived from the challenge's nonce and peer id together
        // with this key and its bytes.
        leaves.push(crate::replication::subtree::subtree_leaf(
            &challenge.nonce,
            &challenge.challenged_peer_id,
            key,
            &bytes,
        ));
        // bytes drops here.
    }

    // Echo the challenge id and return the pinned commitment with the proof.
    SubtreeAuditResponse::Proof {
        challenge_id: challenge.challenge_id,
        commitment: built.commitment().clone(),
        proof: SubtreeProof {
            leaves,
            sibling_cut_hashes: plan.sibling_cut_hashes,
        },
    }
}

/// Handle a round-2 byte challenge (responder side), ADR-0002.
///
/// The auditor has already structurally verified this node's round-1 subtree
/// proof and now demands the ORIGINAL chunk bytes for a small nonce-selected
/// sample of those leaves. For each requested key the responder either returns
/// the bytes ([`SubtreeByteItem::Present`]) or — if it committed to the key but
/// can no longer produce it — an explicit [`SubtreeByteItem::Absent`], which the
/// auditor counts as a provable failure (committing to bytes you don't hold).
///
/// A key the responder never committed to (not in the pinned tree) is also
/// returned `Absent`: the auditor only ever samples keys it saw in round 1, so
/// in practice this guards against a malformed/forged byte challenge rather than
/// an honest mismatch.
///
/// Each key is checked against the pinned commitment *before* storage is
/// touched, so an uncommitted key costs no disk read and never has its bytes
/// served, even when this node happens to store that chunk.
pub async fn handle_subtree_byte_challenge(
    challenge: &SubtreeByteChallenge,
    storage: &LmdbStorage,
    self_peer_id: &PeerId,
    is_bootstrapping: bool,
    // NOTE(review): the type argument of `Arc` is missing in this copy of the
    // source (it looks stripped by extraction) — restore it from the real file.
    commitment_state: Option<&Arc>,
) -> SubtreeByteResponse {
    if is_bootstrapping {
        return SubtreeByteResponse::Bootstrapping {
            challenge_id: challenge.challenge_id,
        };
    }

    if challenge.challenged_peer_id != *self_peer_id.as_bytes() {
        return SubtreeByteResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: "challenged_peer_id does not match this node".to_string(),
        };
    }

    // An honest auditor batches its sample to MAX_BYTE_CHALLENGE_KEYS per
    // challenge so the worst-case response fits the wire cap. Reject larger
    // requests up front: serving them could only produce an unencodable
    // response (and invites disk-read amplification from a forged auditor).
    if challenge.keys.len() > MAX_BYTE_CHALLENGE_KEYS {
        let requested = challenge.keys.len();
        return SubtreeByteResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: format!(
                "byte challenge requests {requested} keys; max {MAX_BYTE_CHALLENGE_KEYS} per challenge"
            ),
        };
    }

    let Some(state) = commitment_state else {
        return SubtreeByteResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: "no commitment state".to_string(),
        };
    };
    // Resolve the SAME commitment the auditor pinned in round 1. If we no longer
    // retain it (it aged out of the last-two-gossiped set), reject — for a
    // recently gossiped pin the auditor treats this as a confirmed failure, like
    // round 1. We serve bytes only for keys actually committed to under this pin.
    let Some(built) = state.lookup_by_hash(&challenge.expected_commitment_hash) else {
        return SubtreeByteResponse::Rejected {
            challenge_id: challenge.challenge_id,
            reason: "unknown commitment hash".to_string(),
        };
    };
    let committed = |key: &XorName| -> bool { built.proof_for(key).is_some() };

    let mut items = Vec::with_capacity(challenge.keys.len());
    for key in &challenge.keys {
        // Gate on commitment membership first. Previously any key found in
        // storage was served as `Present` whether or not it was committed under
        // this pin, contradicting the documented contract and letting a forged
        // challenge trigger reads of arbitrary stored chunks.
        if !committed(key) {
            items.push(SubtreeByteItem::Absent { key: *key });
            continue;
        }

        // Read the original bytes for the requested, committed key.
        if let Ok(Some(bytes)) = storage.get_raw(key).await {
            items.push(SubtreeByteItem::Present { key: *key, bytes });
        } else {
            // Committed to the key but cannot read its bytes → provable failure.
            warn!(
                "Subtree byte audit: committed key {} requested but bytes absent",
                hex::encode(key)
            );
            items.push(SubtreeByteItem::Absent { key: *key });
        }
    }

    SubtreeByteResponse::Items {
        challenge_id: challenge.challenge_id,
        items,
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::replication::commitment_state::BuiltCommitment;
    use crate::replication::subtree::{build_subtree_proof, nonced_leaf_hash, SubtreeLeaf};
    use saorsa_pqc::api::sig::ml_dsa_65;

    // The two-round audit splits into SHIPPED pure functions exercised directly
    // here (no reimplementation that could drift):
    // - round 1: `evaluate_subtree_structure` (pin/identity/signature +
    //   structural root rebuild),
    // - sampling: `spotcheck_leaves` (the 3..=5 nonce-selected leaves), and
    // - round 2: `verify_byte_response` (recompute content-address + freshness
    //   from the bytes the RESPONDER served — the auditor holds nothing).

    fn key(i: u32) -> XorName {
        let mut k = [0u8; 32];
        k[..4].copy_from_slice(&i.to_be_bytes());
        k
    }
    /// The "chunk content" for a key in these fixtures.
The committed tree's leaf + /// `bytes_hash` is `BLAKE3(chunk_bytes(key))`, mirroring the general + /// `(key, BLAKE3(content))` commitment; round 2 serves exactly this content. + fn chunk_bytes(k: &XorName) -> Vec { + let mut v = k.to_vec(); + v.extend_from_slice(b"chunk-body"); + v + } + + /// Build an honest committed tree of `n` keys + a valid round-1 proof for + /// `nonce`. Returns `(built, proof, peer_id)`. The auditor pins `built.hash()`. + fn honest(n: u32, nonce: &[u8; 32]) -> (BuiltCommitment, SubtreeProof, [u8; 32]) { + let (pk, sk) = ml_dsa_65().generate_keypair().unwrap(); + let peer_id = *blake3::hash(&pk.to_bytes()).as_bytes(); + let pk_b = pk.to_bytes(); + let entries: Vec<_> = (0..n) + .map(|i| { + let k = key(i); + (k, *blake3::hash(&chunk_bytes(&k)).as_bytes()) + }) + .collect(); + let built = BuiltCommitment::build(entries, &peer_id, &sk, &pk_b).unwrap(); + let proof = + build_subtree_proof(built.tree(), nonce, &peer_id, |k| Some(chunk_bytes(k))).unwrap(); + (built, proof, peer_id) + } + + /// Round-1 verdict against the pinned commitment. + fn structure( + built: &BuiltCommitment, + proof: &SubtreeProof, + nonce: &[u8; 32], + peer: &[u8; 32], + ) -> Result<(), AuditFailureReason> { + evaluate_subtree_structure(built.commitment(), proof, nonce, &built.hash(), peer) + } + + /// The 3..=5 spot-check leaves the auditor would demand bytes for in round 2. + fn sample<'a>(proof: &'a SubtreeProof, nonce: &[u8; 32], n: u32) -> Vec<&'a SubtreeLeaf> { + spotcheck_leaves( + proof, + nonce, + n, + 8u32.clamp(BYTE_SPOTCHECK_MIN, BYTE_SPOTCHECK_MAX), + ) + } + + // A round-2 `served` closure that returns the HONEST content for every key. + // The nested-Option shape is the `verify_byte_response` callback contract: + // Present{bytes} -> Some(Some(bytes)); Absent -> Some(None); omitted -> None. 
+ #[allow(clippy::option_option, clippy::unnecessary_wraps)] + fn served_honest(key: &XorName) -> Option>> { + Some(Some(chunk_bytes(key))) + } + + // ---- round 1: structure -------------------------------------------------- + + #[test] + fn honest_structure_then_bytes_passes() { + let nonce = [9u8; 32]; + let (built, proof, peer) = honest(400, &nonce); + // Round 1. + assert!(structure(&built, &proof, &nonce, &peer).is_ok()); + // Round 2: honest responder serves the real content for the sample. + let s = sample(&proof, &nonce, built.commitment().key_count); + assert!(!s.is_empty()); + match verify_byte_response(&s, &nonce, &peer, served_honest) { + AuditVerdict::Pass { checked } => assert!(checked >= 1, "must verify >=1 leaf"), + other @ AuditVerdict::Fail(_) => panic!("expected Pass, got {other:?}"), + } + } + + #[test] + fn commitment_bound_to_another_peer_rejected() { + let nonce = [3u8; 32]; + let (built, proof, _peer) = honest(200, &nonce); + let other = [0xAAu8; 32]; + assert_eq!( + structure(&built, &proof, &nonce, &other), + Err(AuditFailureReason::Rejected) + ); + } + + #[test] + fn wrong_pinned_commitment_rejected() { + let nonce = [3u8; 32]; + let (built, proof, peer) = honest(200, &nonce); + let mut wrong_pin = built.hash(); + wrong_pin[0] ^= 0x01; + assert_eq!( + evaluate_subtree_structure(built.commitment(), &proof, &nonce, &wrong_pin, &peer), + Err(AuditFailureReason::Rejected) + ); + } + + #[test] + fn tampered_leaf_structure_rejected() { + let nonce = [3u8; 32]; + let (built, mut proof, peer) = honest(200, &nonce); + if let Some(first) = proof.leaves.first_mut() { + first.bytes_hash[0] ^= 0x01; // breaks root reconstruction + } + assert_eq!( + structure(&built, &proof, &nonce, &peer), + Err(AuditFailureReason::DigestMismatch) + ); + } + + #[test] + fn wrong_leaf_count_structure_rejected() { + let nonce = [3u8; 32]; + let (built, mut proof, peer) = honest(200, &nonce); + proof.leaves.pop(); + assert_eq!( + structure(&built, &proof, &nonce, 
&peer), + Err(AuditFailureReason::DigestMismatch) + ); + } + + // ---- round 2: responder-served bytes ------------------------------------ + + #[test] + fn deleter_absent_bytes_is_confirmed_failure() { + // THE headline fix: a node whose round-1 proof is structurally perfect + // but which has DELETED a committed chunk cannot serve its bytes. It + // signals `Absent` for the sampled key → provable lie → confirmed + // failure. Crucially, the auditor holds NONE of the peer's chunks; the + // verdict depends only on what the responder serves. + let nonce = [9u8; 32]; + let (built, proof, peer) = honest(400, &nonce); + assert!(structure(&built, &proof, &nonce, &peer).is_ok()); + let s = sample(&proof, &nonce, built.commitment().key_count); + // Responder returns Absent for the FIRST sampled key, honest for the rest. + let victim = s.first().map(|l| l.key).unwrap(); + let v = verify_byte_response(&s, &nonce, &peer, |k| { + if *k == victim { + Some(None) // explicit Absent + } else { + Some(Some(chunk_bytes(k))) + } + }); + assert_eq!(v, AuditVerdict::Fail(AuditFailureReason::DigestMismatch)); + } + + #[test] + fn omitted_committed_key_is_confirmed_failure() { + // A responder that simply omits a sampled committed key from its items + // (neither Present nor Absent) is treated identically to Absent: it + // committed to the key and won't serve it → confirmed failure. + let nonce = [9u8; 32]; + let (built, proof, peer) = honest(400, &nonce); + let s = sample(&proof, &nonce, built.commitment().key_count); + let victim = s.first().map(|l| l.key).unwrap(); + let v = verify_byte_response(&s, &nonce, &peer, |k| { + if *k == victim { + None // omitted entirely + } else { + Some(Some(chunk_bytes(k))) + } + }); + assert_eq!(v, AuditVerdict::Fail(AuditFailureReason::DigestMismatch)); + } + + #[test] + fn fake_storage_garbage_bytes_is_confirmed_failure() { + // A "fake-storage" responder claims possession but serves garbage. 
The + // garbage does not hash to the committed content address (`bytes_hash`), + // so the round-2 content-address check fails → confirmed failure. No + // auditor holdings involved. + let nonce = [9u8; 32]; + let (built, proof, peer) = honest(400, &nonce); + let s = sample(&proof, &nonce, built.commitment().key_count); + let v = verify_byte_response(&s, &nonce, &peer, |k| { + let mut garbage = blake3::hash(k).as_bytes().to_vec(); + garbage.extend_from_slice(b"adversary-fake-storage"); + Some(Some(garbage)) + }); + assert_eq!(v, AuditVerdict::Fail(AuditFailureReason::DigestMismatch)); + } + + #[test] + fn correct_content_address_but_stale_freshness_fails() { + // Suppose a responder could serve bytes that hash to the content address + // (it holds the chunk) — then BOTH checks pass; that is honest. But if + // it serves bytes whose freshness hash does not match (e.g. replaying a + // different nonce's digest is impossible since we recompute it here), the + // freshness check must catch any content that doesn't reproduce the + // committed `nonced_hash`. We model a leaf whose committed nonced_hash was + // built under a DIFFERENT nonce, so the audit nonce's recompute differs. + let nonce = [9u8; 32]; + let (built, mut proof, peer) = honest(400, &nonce); + // Rewrite the first leaf's nonced_hash to one bound to a different nonce + // but keep its bytes_hash correct (so structure for THAT leaf's content + // address is fine; only freshness is wrong). + let other_nonce = [0xEEu8; 32]; + let s_keys: Vec = sample(&proof, &nonce, built.commitment().key_count) + .iter() + .map(|l| l.key) + .collect(); + let victim = s_keys.first().copied().unwrap(); + for leaf in &mut proof.leaves { + if leaf.key == victim { + leaf.nonced_hash = + nonced_leaf_hash(&other_nonce, &peer, &leaf.key, &chunk_bytes(&leaf.key)); + } + } + // Re-sample against the (now tampered) proof; serve honest content. 
+ let s = sample(&proof, &nonce, built.commitment().key_count); + let v = verify_byte_response(&s, &nonce, &peer, served_honest); + assert_eq!(v, AuditVerdict::Fail(AuditFailureReason::DigestMismatch)); + } + + #[test] + fn auditor_holds_nothing_still_catches_deleter() { + // Explicit contract: the auditor's own storage is irrelevant. A deleter + // is caught purely from its served (absent) response. (Compare the OLD + // design, where an auditor holding none of the chunks went Inconclusive + // and the deleter walked free.) + let nonce = [0x21u8; 32]; + let (built, proof, peer) = honest(256, &nonce); + assert!(structure(&built, &proof, &nonce, &peer).is_ok()); + let s = sample(&proof, &nonce, built.commitment().key_count); + // Responder is a total deleter: Absent for everything. + let v = verify_byte_response(&s, &nonce, &peer, |_| Some(None)); + assert_eq!(v, AuditVerdict::Fail(AuditFailureReason::DigestMismatch)); + } + + #[test] + fn sample_size_is_in_3_to_5_band() { + // ADR-0002: round-2 samples a SMALL surprise set (3..=5) of the proven + // leaves. For a large subtree the sample is capped at 5. + let nonce = [7u8; 32]; + let (built, proof, _peer) = honest(1024, &nonce); + let s = sample(&proof, &nonce, built.commitment().key_count); + assert!( + (BYTE_SPOTCHECK_MIN as usize..=BYTE_SPOTCHECK_MAX as usize).contains(&s.len()), + "sample {} must be within 3..=5", + s.len() + ); + } + + #[test] + fn full_pass_requires_every_sampled_leaf() { + // checked must equal the number of sampled leaves on a pass (no leaf is + // silently skipped — every sampled, committed key must verify). 
+ let nonce = [11u8; 32]; + let (built, proof, peer) = honest(400, &nonce); + let s = sample(&proof, &nonce, built.commitment().key_count); + match verify_byte_response(&s, &nonce, &peer, served_honest) { + AuditVerdict::Pass { checked } => assert_eq!(checked, s.len()), + other @ AuditVerdict::Fail(_) => panic!("expected Pass, got {other:?}"), + } + } + + // ---- end-to-end gate composition ---------------------------------------- + + #[test] + fn structure_fail_short_circuits_before_round_2() { + // A structurally invalid proof is rejected in round 1; the byte challenge + // is never issued. We assert the round-1 gate returns Err so the auditor + // (verify_subtree_response) never reaches request_byte_proof. + let nonce = [5u8; 32]; + let (built, mut proof, peer) = honest(300, &nonce); + if let Some(first) = proof.leaves.first_mut() { + first.bytes_hash[0] ^= 0x01; + } + assert!(structure(&built, &proof, &nonce, &peer).is_err()); + } + + /// Build an honest committed tree whose keys are deliberately "FAR": their + /// addresses live at the high end of the XOR space (top bytes = 0xFF). On the + /// auditor side these are the leaves `observe_closeness` counts toward `far`. + fn honest_far(n: u32, nonce: &[u8; 32]) -> (BuiltCommitment, SubtreeProof, [u8; 32]) { + let (pk, sk) = ml_dsa_65().generate_keypair().unwrap(); + let peer_id = *blake3::hash(&pk.to_bytes()).as_bytes(); + let pk_b = pk.to_bytes(); + let entries: Vec<_> = (0..n) + .map(|i| { + let mut k = [0xFFu8; 32]; + k[28..].copy_from_slice(&i.to_be_bytes()); + (k, *blake3::hash(&chunk_bytes(&k)).as_bytes()) + }) + .collect(); + let built = BuiltCommitment::build(entries, &peer_id, &sk, &pk_b).unwrap(); + let proof = + build_subtree_proof(built.tree(), nonce, &peer_id, |k| Some(chunk_bytes(k))).unwrap(); + (built, proof, peer_id) + } + + /// ADR-0002 "Closeness" is OBSERVE-ONLY: far-keyed honest proofs verify + /// exactly like near-keyed ones. 
The verdict (structure + served bytes) is + /// closeness-blind, so a "far/padding" shape can never produce a Fail. + #[test] + fn closeness_is_observe_only_far_keys_still_pass() { + let nonce = [9u8; 32]; + + let (built_far, proof_far, peer_far) = honest_far(400, &nonce); + assert!(structure(&built_far, &proof_far, &nonce, &peer_far).is_ok()); + let sf = sample(&proof_far, &nonce, built_far.commitment().key_count); + let v_far = verify_byte_response(&sf, &nonce, &peer_far, served_honest); + + let (built_near, proof_near, peer_near) = honest(400, &nonce); + assert!(structure(&built_near, &proof_near, &nonce, &peer_near).is_ok()); + let sn = sample(&proof_near, &nonce, built_near.commitment().key_count); + let v_near = verify_byte_response(&sn, &nonce, &peer_near, served_honest); + + match (&v_far, &v_near) { + (AuditVerdict::Pass { checked: cf }, AuditVerdict::Pass { checked: cn }) => { + assert!(*cf >= 1 && *cn >= 1); + } + other => panic!("both honest proofs must Pass regardless of closeness, got {other:?}"), + } + assert!( + !matches!(v_far, AuditVerdict::Fail(_)), + "far/padding-shaped honest proof must NEVER fail, got {v_far:?}" + ); + } + + // Unused-leaf constructor guard: keep SubtreeLeaf import meaningful. + #[test] + fn subtree_leaf_is_constructible() { + let _l = SubtreeLeaf { + key: key(1), + bytes_hash: [0u8; 32], + nonced_hash: [0u8; 32], + }; + } +} diff --git a/src/replication/subtree.rs b/src/replication/subtree.rs new file mode 100644 index 00000000..2423bec8 --- /dev/null +++ b/src/replication/subtree.rs @@ -0,0 +1,1086 @@ +//! Gossip-triggered contiguous-subtree storage proof (ADR-0002). +//! +//! Pure, network-free core of the audit redesign. Given a peer's signed +//! [`StorageCommitment`] and an auditor-chosen random nonce, both sides +//! deterministically select **one contiguous subtree** of the committed +//! Merkle tree; the responder expands that subtree to its leaves plus the +//! 
sibling cut-hashes on the path to the root; the auditor rebuilds the root +//! and spot-checks a few leaves against real chunk bytes. +//! +//! Three independent checks (ADR-0002 "Verification, three independent +//! checks"); this module owns the first two — the third (response deadline) +//! is enforced by the caller: +//! +//! 1. **Structure** — [`verify_subtree_proof`] re-derives the selected branch +//! from `(nonce, key_count)`, rebuilds the root from the returned leaves and +//! cut-hashes, and requires it to equal the pinned root. +//! 2. **Real bytes** — [`select_spotcheck_indices`] picks a few leaves within +//! the subtree; the caller fetches their bytes and checks both the plain +//! content hash and the nonce freshness hash. Faking a fraction `x` of +//! leaves survives only `(1 - x)^k`. +//! +//! ## Tree geometry (must match [`super::commitment::MerkleTree`]) +//! +//! Leaves are sorted by key and fill positions `0..N`. The tree is +//! left-packed: when a level has an odd number of nodes the last node is +//! paired with itself (`node_hash(x, x)`). There are no explicit padding +//! leaves; "padding" is the empty right side of a subtree slot that extends +//! past `N`. Depth `D = ceil(log2(N))`. A node identified by `(depth, slot)` +//! (depth measured from the root, slot in `0..2^depth`) covers the contiguous +//! leaf range `[slot * span, (slot + 1) * span)` where `span = 2^(D - depth)`, +//! intersected with `0..N`. + +use super::commitment::{leaf_hash, node_hash, StorageCommitment, MAX_COMMITMENT_KEY_COUNT}; +use super::protocol::compute_audit_digest; +use crate::ant_protocol::XorName; +use serde::{Deserialize, Serialize}; + +/// Below this key count the whole tree is challenged; `sqrt` rounding is +/// meaningless for tiny trees and a full proof is cheap. +pub const SMALL_TREE_FULL_AUDIT_FLOOR: u32 = 4; + +/// One leaf of the selected subtree, as returned by the responder. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct SubtreeLeaf { + /// The committed key (chunk address) at this leaf position. + pub key: XorName, + /// `BLAKE3(record_bytes)` — the plain content hash. This is also the + /// chunk's network address, so it is public; possessing it does NOT prove + /// possession of the bytes (that is what `nonced_hash` is for). + pub bytes_hash: [u8; 32], + /// `compute_audit_digest(nonce, peer_id, key, record_bytes)` — the + /// freshness hash. Only a holder of the actual bytes can produce it for a + /// fresh nonce, so a spot-check on it proves real possession. + pub nonced_hash: [u8; 32], +} + +/// A responder's single-contiguous-subtree proof (ADR-0002 "The proof"). +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct SubtreeProof { + /// Every leaf of the selected subtree, in ascending leaf-index order. + pub leaves: Vec, + /// One sibling cut-hash per level on the path from the root down to the + /// selected subtree root, ordered root-first. Each is the plain hash of + /// the unselected sibling node at that level. + pub sibling_cut_hashes: Vec<[u8; 32]>, +} + +/// The deterministically-selected contiguous subtree, derived from +/// `(nonce, key_count)` and agreed by both sides. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SubtreePath { + /// Depth of the subtree root below the tree root (0 = whole tree). + pub depth: u32, + /// Slot index of the subtree root within its level, in `0..2^depth`. + pub slot: u32, + /// First real leaf index covered (inclusive). + pub leaf_start: u32, + /// One past the last real leaf index covered (exclusive). Always + /// `leaf_end > leaf_start`, so the selection never covers zero real + /// leaves — this is the ADR's dead-block fix. + pub leaf_end: u32, +} + +impl SubtreePath { + /// Number of real (non-padding) leaves in the selected subtree. 
+ #[must_use] + pub fn real_leaf_count(&self) -> u32 { + self.leaf_end - self.leaf_start + } +} + +/// Tree depth `D = ceil(log2(key_count))`, matching `MerkleTree` / `verify_path`. +/// +/// `key_count == 1` → depth 0 (the single leaf is the root). Returns `None` +/// for an out-of-protocol `key_count` so callers reject it before any work. +#[must_use] +fn tree_depth(key_count: u32) -> Option { + if key_count == 0 || key_count > MAX_COMMITMENT_KEY_COUNT { + return None; + } + // checked_next_power_of_two cannot fail under the cap above, but the + // explicit check keeps behaviour identical across debug/release. + let rounded = key_count.checked_next_power_of_two()?; + Some(rounded.trailing_zeros()) +} + +/// Count real leaves under the node at `(depth, slot)` for a tree of `key_count` +/// leaves. Pure function of geometry — identical on auditor and responder. +/// +/// `span = 2^(total_depth - depth)`; the node covers `[slot*span, (slot+1)*span)` +/// clamped to `0..key_count`. +#[must_use] +fn real_leaves_under(depth: u32, slot: u64, key_count: u32, total_depth: u32) -> u32 { + let levels_below = total_depth - depth; + // span fits in u64: total_depth <= 20 for key_count <= 1e6. + let span = 1u64 << levels_below; + let start = slot.saturating_mul(span).min(u64::from(key_count)); + let end = slot + .saturating_add(1) + .saturating_mul(span) + .min(u64::from(key_count)); + // end >= start always; difference fits in u32 (<= key_count). + u32::try_from(end - start).unwrap_or(0) +} + +/// `ceil(sqrt(key_count))` — the real-leaf floor a selected subtree must meet. +#[must_use] +fn sqrt_floor(key_count: u32) -> u32 { + // Exact integer ceil(sqrt(n)), float-free and MSRV-safe (no u64::isqrt). + // Newton's method converges to floor(sqrt(n)); then round up unless n is a + // perfect square. Always at least 1. 
+ let n = u64::from(key_count); + if n <= 1 { + return 1; + } + let mut x = n; + let mut y = x.div_ceil(2); + while y < x { + x = y; + y = (x + n / x) / 2; + } + // x == floor(sqrt(n)) here. + let ceil = if x.saturating_mul(x) == n { x } else { x + 1 }; + u32::try_from(ceil.max(1)).unwrap_or(u32::MAX) +} + +/// Read bit `index` of the nonce (bit 0 = MSB of byte 0), `index` 0-based. +/// +/// `1 → left child, 0 → right child` (ADR). With a 256-bit nonce and a tree +/// depth ≤ 20 we never run out of bits. +#[must_use] +fn nonce_bit(nonce: &[u8; 32], index: u32) -> bool { + let byte = (index / 8) as usize; + let bit = 7 - (index % 8); + // byte < 32 because index < 256 for any reachable depth; guard anyway. + nonce.get(byte).is_some_and(|b| (b >> bit) & 1 == 1) +} + +/// Deterministically select one contiguous subtree from `(nonce, key_count)`. +/// +/// Walks the nonce bits from the root, descending into the child the bit picks, +/// and **stops at the smallest branch whose real-leaf count is still ≥ +/// `ceil(sqrt(key_count))`**. Because an all-padding child has zero real leaves +/// (< the floor), the walk never descends into one — so the selection always +/// covers ≥ `sqrt` real leaves and is never empty (ADR dead-block fix). +/// +/// For `key_count <= SMALL_TREE_FULL_AUDIT_FLOOR` the whole tree is selected. +/// +/// Returns `None` only for an out-of-protocol `key_count` (caller rejects). +#[must_use] +pub fn select_subtree_path(nonce: &[u8; 32], key_count: u32) -> Option { + let total_depth = tree_depth(key_count)?; + + // Tiny trees: challenge everything. + if key_count <= SMALL_TREE_FULL_AUDIT_FLOOR { + return Some(SubtreePath { + depth: 0, + slot: 0, + leaf_start: 0, + leaf_end: key_count, + }); + } + + let floor = sqrt_floor(key_count); + let mut depth = 0u32; + let mut slot = 0u64; // slot within the current level + + // Descend while the chosen child still meets the floor. 
+ while depth < total_depth { + let go_left = nonce_bit(nonce, depth); + // 1 = left child (bit set), 0 = right child. Right child is the odd slot. + let child_slot = slot * 2 + u64::from(!go_left); + let child_real = real_leaves_under(depth + 1, child_slot, key_count, total_depth); + if child_real < floor { + break; // descending would drop below the floor → stay here + } + depth += 1; + slot = child_slot; + } + + let span = 1u64 << (total_depth - depth); + let leaf_start = + u32::try_from(slot.saturating_mul(span).min(u64::from(key_count))).unwrap_or(key_count); + let leaf_end = u32::try_from( + slot.saturating_add(1) + .saturating_mul(span) + .min(u64::from(key_count)), + ) + .unwrap_or(key_count); + + Some(SubtreePath { + depth, + slot: u32::try_from(slot).unwrap_or(u32::MAX), + leaf_start, + leaf_end, + }) +} + +/// Pick `k` distinct nonce-random leaf positions within the selected subtree. +/// +/// Returned as indices into `path.real_leaf_count()` (0-based within the +/// subtree). Used for the real-bytes spot-check (ADR-0002). Deterministic from +/// the nonce so the auditor and any observer derive the same positions; the +/// responder cannot predict-and-fake only these because it must produce a +/// correct nonced hash for *every* returned leaf anyway — the spot-check just +/// bounds how many it can fake and still pass. +#[must_use] +pub fn select_spotcheck_indices(nonce: &[u8; 32], path: &SubtreePath, k: u32) -> Vec { + let n = path.real_leaf_count(); + if n == 0 { + return Vec::new(); + } + if n <= k { + return (0..n).collect(); + } + // Derive a stream of indices by hashing (nonce || counter) and reducing + // mod n; skip collisions. Bounded: k is small (clamped to the 3..=5 band) + // and n > k. 
+ let mut out: Vec = Vec::with_capacity(k as usize); + let mut counter: u32 = 0; + while u32::try_from(out.len()).unwrap_or(u32::MAX) < k { + let mut h = blake3::Hasher::new(); + h.update(b"autonomi.ant.replication.audit_spotcheck.v1"); + h.update(nonce); + h.update(&counter.to_le_bytes()); + let digest = *h.finalize().as_bytes(); + let mut word = [0u8; 4]; + word.copy_from_slice(&digest[..4]); + let idx = u32::from_le_bytes(word) % n; + if !out.contains(&idx) { + out.push(idx); + } + counter = counter.wrapping_add(1); + // Bound the hash stream (vanishingly unlikely to bite with n > k). + if counter > k.saturating_mul(64) { + break; + } + } + // Top up deterministically if the bounded hash stream collided too often: + // take the lowest indices not yet selected. Still nonce-independent only in + // this (astronomically rare) tail, and identical on every observer — the + // caller is guaranteed exactly min(k, n) distinct indices, so the byte + // sample is never silently smaller than requested. + let mut candidate: u32 = 0; + while u32::try_from(out.len()).unwrap_or(u32::MAX) < k && candidate < n { + if !out.contains(&candidate) { + out.push(candidate); + } + candidate += 1; + } + out +} + +/// Verdict from [`verify_subtree_proof`]'s structural check. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum StructureVerdict { + /// Proof is well-formed and its root matches the pinned commitment. + Valid, + /// Proof is malformed or its root does not match. Carries a static reason + /// for logging; all variants are confirmed failures, not benign. + Invalid(&'static str), +} + +/// Structural verification (ADR-0002 check 1): the returned subtree genuinely +/// belongs to the committed tree. +/// +/// Re-derives the selected branch from `(nonce, commitment.key_count)`, +/// rebuilds the root from `proof.leaves` and `proof.sibling_cut_hashes`, and +/// requires it to equal `commitment.root`. 
Also checks leaf count and +/// ascending-key order (the committed tree sorts leaves by key). +/// +/// This does NOT verify possession of bytes — that is the caller's spot-check +/// using [`select_spotcheck_indices`]. It only proves the structure. +#[must_use] +pub fn verify_subtree_proof( + proof: &SubtreeProof, + nonce: &[u8; 32], + commitment: &StorageCommitment, +) -> StructureVerdict { + let Some(path) = select_subtree_path(nonce, commitment.key_count) else { + return StructureVerdict::Invalid("out-of-protocol key_count"); + }; + + // Leaf count must equal the agreed subtree's real-leaf count exactly. + let expected_leaves = path.real_leaf_count() as usize; + if proof.leaves.len() != expected_leaves { + return StructureVerdict::Invalid("wrong leaf count"); + } + // Sibling cut-hashes: one per level on the path to the subtree root. + if proof.sibling_cut_hashes.len() != path.depth as usize { + return StructureVerdict::Invalid("wrong cut-hash count"); + } + + // Leaves must be strictly ascending by key (matches MerkleTree sort), which + // also rejects duplicates. + for w in proof.leaves.windows(2) { + if let [a, b] = w { + if a.key >= b.key { + return StructureVerdict::Invalid("leaves not strictly ascending"); + } + } + } + + // Out-of-protocol key_count cannot happen here (select_subtree_path already + // returned Some), but recompute total_depth defensively for the climb maths. + let Some(total_depth) = tree_depth(commitment.key_count) else { + return StructureVerdict::Invalid("out-of-protocol key_count"); + }; + + // Phase A — reconstruct the selected subtree's root NODE exactly as the + // committed tree's level-by-level build produces it. 
The subtree root sits + // at `(level_from_leaves, slot)`, covering a left-packed block of leaves; + // folding that block up `level_from_leaves` levels with the same + // self-pair-the-last-node rule as `MerkleTree::build_next_level` yields the + // identical node (including the `node_hash(x, x)` self-pair when the block + // is the tree's odd tail at some level). `fold_to_root` stopped at a single + // hash and so skipped the self-pair when a truncated block reached length 1 + // before climbing all the way to the subtree-root level — the geometry bug. + let leaf_hashes: Vec<[u8; 32]> = proof + .leaves + .iter() + .map(|l| leaf_hash(&l.key, &l.bytes_hash)) + .collect(); + let levels_to_subtree_root = total_depth - path.depth; + let mut cur = fold_levels(leaf_hashes, levels_to_subtree_root); + + // Phase B — climb from the subtree root to the tree root using one sibling + // cut-hash per level, exactly like `verify_path`: the climb's left/right + // choice is the real node-index parity, NOT a nonce bit, and the self-pair + // of an odd level's last node falls out naturally when the builder supplied + // the chosen node itself as its own sibling. The cut-hashes are root-first, + // so we consume them in reverse (lowest climb step uses the last cut-hash). + // + // We recompute the node index of the subtree root the same way the builder + // walked the nonce bits, then halve it as we climb — mirroring `verify_path`. 
+ let mut node_index = u64::from(path.slot); + for level_above in (0..path.depth).rev() { + let Some(sibling) = proof.sibling_cut_hashes.get(level_above as usize) else { + return StructureVerdict::Invalid("missing cut-hash"); + }; + cur = if node_index % 2 == 0 { + node_hash(&cur, sibling) + } else { + node_hash(sibling, &cur) + }; + node_index /= 2; + } + + if cur == commitment.root { + StructureVerdict::Valid + } else { + StructureVerdict::Invalid("root mismatch") + } +} + +/// Fold a contiguous, left-aligned block of node hashes up exactly `levels` +/// levels, applying the same left-packed self-pair rule as +/// `MerkleTree::build_next_level` (`node_hash(x, x)` for an unpaired last node). +/// +/// This is the generalisation of a single-leaf inclusion fold to a *range* of +/// leaves: a subtree root at `(levels, slot)` covers a block whose left edge is +/// pair-aligned at every sub-level, so the only odd run that can occur is the +/// tree's genuine odd tail — exactly when `build_next_level` self-pairs. Folding +/// the block `levels` times therefore reproduces the committed node bit-for-bit, +/// including the self-pair that `fold_to_root` used to skip by stopping at a +/// single hash too early. +/// +/// `levels == 0` returns the block's single element unchanged (the subtree IS +/// the tree, e.g. the small-tree full-audit case after its own folds, or a +/// single-leaf tree). An empty input is impossible here (callers guarantee ≥ 1 +/// leaf via the dead-block fix); returns a zero hash defensively. +#[must_use] +fn fold_levels(mut level: Vec<[u8; 32]>, levels: u32) -> [u8; 32] { + if level.is_empty() { + return [0u8; 32]; + } + for _ in 0..levels { + let mut next = Vec::with_capacity(level.len().div_ceil(2)); + let mut i = 0; + while i < level.len() { + let left = level[i]; + // Missing right sibling → self-pair the last node, identical to + // `build_next_level`. 
Within a selected block this happens only at + // the tree's odd tail, so it matches the committed build exactly. + let right = level.get(i + 1).copied().unwrap_or(left); + next.push(node_hash(&left, &right)); + i += 2; + } + level = next; + } + // After `levels` folds of a `2^levels`-span left-aligned block, exactly one + // node remains; defensively fall back if the block was shorter. + level.first().copied().unwrap_or([0u8; 32]) +} + +/// Build the per-leaf nonced freshness hash for a subtree leaf (responder +/// side), reusing the existing audit digest. +#[must_use] +pub fn nonced_leaf_hash( + nonce: &[u8; 32], + challenged_peer_id: &[u8; 32], + key: &XorName, + record_bytes: &[u8], +) -> [u8; 32] { + compute_audit_digest(nonce, challenged_peer_id, key, record_bytes) +} + +/// Why a responder could not build a subtree proof for a challenge. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BuildProofError { + /// The challenge's `key_count` (from the pinned commitment) is out of + /// protocol range. Should never happen for a commitment we built. + BadKeyCount, + /// A selected leaf's key could not be resolved from the tree (internal + /// inconsistency; should never happen). + MissingKey { + /// The leaf index that could not be resolved. + leaf_index: u32, + }, + /// The responder no longer holds the bytes for a selected, committed key. + /// This is real storage loss / deliberate non-response — the caller turns + /// it into a confirmed audit failure, NOT a benign rejection. + MissingBytes { + /// The committed key whose bytes are gone. + key: XorName, + }, +} + +/// Build the single-contiguous-subtree proof for `(nonce, tree)` (responder). +/// +/// `bytes_for(&key)` returns the chunk bytes the responder holds for a key, or +/// `None` if it cannot read them. 
Walks the same nonce-selected path the +/// auditor will re-derive, reads the unselected sibling cut-hashes directly +/// from the committed tree (so they are provably consistent with the gossiped +/// root), and builds each selected leaf's plain and nonced hashes from the real +/// bytes. +/// +/// # Errors +/// +/// See [`BuildProofError`]. `MissingBytes` is the one the caller penalises; +/// the others indicate an internal inconsistency. +pub fn build_subtree_proof( + tree: &super::commitment::MerkleTree, + nonce: &[u8; 32], + challenged_peer_id: &[u8; 32], + bytes_for: impl Fn(&XorName) -> Option>, +) -> Result { + let plan = subtree_plan(tree, nonce)?; + let mut leaves = Vec::with_capacity(plan.leaf_keys.len()); + for key in &plan.leaf_keys { + let bytes = bytes_for(key).ok_or(BuildProofError::MissingBytes { key: *key })?; + leaves.push(subtree_leaf(nonce, challenged_peer_id, key, &bytes)); + } + Ok(SubtreeProof { + leaves, + sibling_cut_hashes: plan.sibling_cut_hashes, + }) +} + +/// The pure (no-bytes) geometry of a subtree proof. +/// +/// Holds the ordered keys whose bytes the responder must hash and the sibling +/// cut-hashes read from the tree. Splitting this out lets an async responder +/// read chunk bytes per leaf without forcing the tree-walking maths to be async. +#[derive(Debug, Clone)] +pub struct SubtreePlan { + /// The selected leaves' keys, in ascending leaf-index order. + pub leaf_keys: Vec, + /// One sibling cut-hash per level on the path to the subtree root, + /// root-first. + pub sibling_cut_hashes: Vec<[u8; 32]>, +} + +/// Compute the [`SubtreePlan`] for `(nonce, tree)` — selection geometry only, +/// no chunk bytes touched. +/// +/// # Errors +/// +/// [`BuildProofError::BadKeyCount`] for an out-of-protocol tree; +/// [`BuildProofError::MissingKey`] if a selected leaf index is not in the tree +/// (internal inconsistency). 
+pub fn subtree_plan( + tree: &super::commitment::MerkleTree, + nonce: &[u8; 32], +) -> Result { + let key_count = tree.key_count(); + let path = select_subtree_path(nonce, key_count).ok_or(BuildProofError::BadKeyCount)?; + + let mut leaf_keys = Vec::with_capacity(path.real_leaf_count() as usize); + for idx in path.leaf_start..path.leaf_end { + let key = tree + .key_at(idx as usize) + .ok_or(BuildProofError::MissingKey { leaf_index: idx })?; + leaf_keys.push(key); + } + + // Sibling cut-hashes, root-first. At descent step `d` (0-based from the + // root), the chosen child is on the side the nonce bit picks; the sibling + // is the other child at level `total_depth - (d + 1)` (counting up from + // leaves). On an odd-length level the missing sibling self-pairs, i.e. the + // sibling hash is the chosen node itself. + let total_depth = u32::try_from(tree.levels_count().saturating_sub(1)).unwrap_or(0); + let mut sibling_cut_hashes = Vec::with_capacity(path.depth as usize); + let mut slot = 0u64; + for d in 0..path.depth { + let go_left = nonce_bit(nonce, d); + let child = slot * 2 + u64::from(!go_left); + let sibling = child ^ 1; + let level_from_leaves = (total_depth - (d + 1)) as usize; + let chosen_hash = tree.node_at(level_from_leaves, child); + let sib_hash = tree + .node_at(level_from_leaves, sibling) + .or(chosen_hash) + .ok_or(BuildProofError::BadKeyCount)?; + sibling_cut_hashes.push(sib_hash); + slot = child; + } + + Ok(SubtreePlan { + leaf_keys, + sibling_cut_hashes, + }) +} + +/// Build one subtree leaf from its key and the chunk bytes the responder holds. 
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::replication::commitment::MerkleTree;
    use std::collections::HashSet;

    /// Test key carrying `i` big-endian in its first four bytes, so numeric
    /// order == sort order.
    fn xn_u32(i: u32) -> XorName {
        let mut k = [0u8; 32];
        k[..4].copy_from_slice(&i.to_be_bytes());
        k
    }

    /// Nonce with every byte set to `seed`.
    fn nonce_of(seed: u8) -> [u8; 32] {
        [seed; 32]
    }

    // ---- sqrt_floor -------------------------------------------------------

    #[test]
    fn sqrt_floor_is_exact_ceil() {
        assert_eq!(sqrt_floor(1), 1);
        assert_eq!(sqrt_floor(4), 2);
        assert_eq!(sqrt_floor(5), 3); // ceil(sqrt(5)) = 3
        assert_eq!(sqrt_floor(9), 3);
        assert_eq!(sqrt_floor(10), 4);
        assert_eq!(sqrt_floor(100), 10);
        assert_eq!(sqrt_floor(101), 11);
        assert_eq!(sqrt_floor(1_000_000), 1000);
    }

    // ---- real_leaves_under ------------------------------------------------

    #[test]
    fn real_leaves_under_root_is_all() {
        let d = tree_depth(100).unwrap();
        assert_eq!(real_leaves_under(0, 0, 100, d), 100);
    }

    #[test]
    fn real_leaves_under_padding_slot_is_zero() {
        // key_count = 5, total_depth = 3 (next_pow2(5)=8). Leaf slots 5,6,7
        // at the bottom are padding. The right half at depth 1 (slot 1) covers
        // leaves [4,8) → only leaf 4 is real.
        let d = tree_depth(5).unwrap();
        assert_eq!(d, 3);
        assert_eq!(real_leaves_under(1, 0, 5, d), 4); // [0,4)
        assert_eq!(real_leaves_under(1, 1, 5, d), 1); // [4,8) ∩ [0,5) = {4}
        assert_eq!(real_leaves_under(3, 7, 5, d), 0); // pure padding leaf
        assert_eq!(real_leaves_under(2, 3, 5, d), 0); // [6,8) pure padding
    }

    // ---- select_subtree_path: dead-block regression -----------------------

    #[test]
    fn selection_never_empty_across_many_sizes_and_nonces() {
        for n in [
            5u32, 6, 7, 9, 13, 17, 33, 65, 100, 129, 333, 1000, 1024, 1025,
        ] {
            let floor = sqrt_floor(n);
            for seed in 0u8..=255 {
                let path = select_subtree_path(&nonce_of(seed), n).unwrap();
                assert!(
                    path.real_leaf_count() >= floor.min(n),
                    "n={n} seed={seed}: real={} < floor={floor}",
                    path.real_leaf_count()
                );
                assert!(
                    path.real_leaf_count() >= 1,
                    "n={n} seed={seed}: empty selection"
                );
                assert!(path.leaf_end <= n);
                assert!(path.leaf_start < path.leaf_end);
            }
        }
    }

    #[test]
    fn small_trees_select_whole_tree() {
        for n in 1..=SMALL_TREE_FULL_AUDIT_FLOOR {
            let path = select_subtree_path(&nonce_of(7), n).unwrap();
            assert_eq!(path.depth, 0);
            assert_eq!(path.leaf_start, 0);
            assert_eq!(path.leaf_end, n);
        }
    }

    #[test]
    fn selection_is_deterministic() {
        let n = 500;
        let a = select_subtree_path(&nonce_of(42), n).unwrap();
        let b = select_subtree_path(&nonce_of(42), n).unwrap();
        assert_eq!(a, b);
    }

    #[test]
    fn different_nonces_cover_different_branches_over_time() {
        // Not every nonce differs, but the set of selected ranges must be > 1.
        let n = 1024;
        let starts: HashSet<u32> = (0u8..=255)
            .map(|seed| select_subtree_path(&nonce_of(seed), n).unwrap().leaf_start)
            .collect();
        assert!(
            starts.len() > 4,
            "nonce should spread selection: {}",
            starts.len()
        );
    }

    /// Deterministic per-trial nonce (no RNG): hash a counter.
    fn nonce_for_trial(i: u32) -> [u8; 32] {
        let mut h = blake3::Hasher::new();
        h.update(b"detection-sim-trial");
        h.update(&i.to_le_bytes());
        *h.finalize().as_bytes()
    }

    /// Catch rate over `trials` audits: fraction whose nonce-selected subtree
    /// overlaps at least one deleted leaf index.
    fn catch_rate(n: u32, deleted: &HashSet<u32>, trials: u32) -> f64 {
        let mut caught = 0u32;
        for t in 0..trials {
            let path = select_subtree_path(&nonce_for_trial(t), n).unwrap();
            if (path.leaf_start..path.leaf_end).any(|i| deleted.contains(&i)) {
                caught += 1;
            }
        }
        f64::from(caught) / f64::from(trials)
    }

    #[test]
    fn detection_uniform_fast_clustered_floor() {
        // ADR-0002 Validation: uniform deletions are caught fast; clustered
        // (contiguous-block) deletions are caught at roughly the deleted
        // fraction per audit (a floor), much slower. This encodes the core
        // security claim that the audit RATE (not per-audit cleverness) is the
        // lever against a clustered deleter.
        let n = 1024u32; // sqrt = 32
        let del_count = n / 10; // delete 10% ≈ 102

        // Uniform: spread deletions evenly across the keyspace.
        let uniform: HashSet<u32> = (0..del_count).map(|i| (i * n / del_count) % n).collect();
        let uniform_rate = catch_rate(n, &uniform, 256);

        // Clustered: one contiguous block of the same size.
        let clustered: HashSet<u32> = (0..del_count).collect();
        let clustered_rate = catch_rate(n, &clustered, 256);

        // Uniform should be caught on essentially every audit (spread across the
        // whole tree; any selected subtree overlaps some deletion).
        assert!(
            uniform_rate > 0.95,
            "uniform deletions should be caught almost every audit, got {uniform_rate}"
        );
        // Clustered (one contiguous f-block) is a floor NEAR the deleted
        // fraction f=0.1 — the quantitative ADR claim. The exact rate depends on
        // selection geometry (a block of ~102 leaves is hit when the selected
        // ~sqrt(N) subtree overlaps it), but it must sit in a tight band around
        // f, well below the uniform rate. We bound it to [0.04, 0.30].
        assert!(
            (0.04..=0.30).contains(&clustered_rate),
            "clustered catch-rate should be near f=0.1, got {clustered_rate}"
        );
        assert!(
            uniform_rate > clustered_rate * 2.0,
            "uniform ({uniform_rate}) must be far easier to catch than clustered ({clustered_rate})"
        );
    }

    #[test]
    fn subtree_size_near_sqrt_for_balanced_tree() {
        // For a power-of-two tree the selection should land near sqrt(N).
        let n = 1024; // sqrt = 32, floor = 32
        let path = select_subtree_path(&nonce_of(3), n).unwrap();
        // It stops as soon as a child would drop below floor; the subtree size
        // is between floor and 2*floor for a balanced tree.
        assert!(path.real_leaf_count() >= 32);
        assert!(
            path.real_leaf_count() <= 64,
            "got {}",
            path.real_leaf_count()
        );
    }

    // ---- end-to-end proof build + verify ----------------------------------

    /// Deterministic chunk bytes for a key (test fixture). The tree is built
    /// from `BLAKE3` of exactly these bytes, so the proof and the committed
    /// root agree — mirroring how a real responder hashes the chunk it holds.
    fn chunk_bytes(key: &XorName) -> Vec<u8> {
        // Distinct, non-trivial bytes derived from the key.
        let mut v = key.to_vec();
        v.extend_from_slice(b"chunk-body");
        v
    }

    /// Build tree entries `(key, BLAKE3(chunk_bytes(key)))` for `n` keys.
    fn entries_for(n: u32) -> Vec<(XorName, [u8; 32])> {
        (0..n)
            .map(|i| {
                let key = xn_u32(i);
                let bytes_hash = *blake3::hash(&chunk_bytes(&key)).as_bytes();
                (key, bytes_hash)
            })
            .collect()
    }

    /// Reference responder: build a real subtree proof via the production
    /// [`build_subtree_proof`] from a `MerkleTree` over `entries`. Leaves are
    /// hashed from `chunk_bytes(key)` — the same bytes whose hash built the
    /// tree — so an honest proof verifies. This makes the tests exercise the
    /// exact builder the responder runs.
    fn build_proof(
        entries: &[(XorName, [u8; 32])],
        nonce: &[u8; 32],
        peer_id: &[u8; 32],
    ) -> (SubtreeProof, StorageCommitment) {
        let tree = MerkleTree::build(entries.to_vec()).unwrap();
        let key_count = tree.key_count();
        let proof = build_subtree_proof(&tree, nonce, peer_id, |k| Some(chunk_bytes(k))).unwrap();
        let commitment = fake_commitment(tree.root(), key_count, *peer_id);
        (proof, commitment)
    }

    /// Commitment fixture with zero-filled key and signature placeholders;
    /// only `root` and `key_count` matter to the structural check.
    fn fake_commitment(root: [u8; 32], key_count: u32, peer: [u8; 32]) -> StorageCommitment {
        StorageCommitment {
            root,
            key_count,
            sender_peer_id: peer,
            sender_public_key: vec![0u8; 1952],
            signature: vec![0u8; 3293],
        }
    }

    #[test]
    fn honest_proof_verifies_at_many_sizes() {
        let peer = [0xABu8; 32];
        for n in [5u32, 8, 13, 17, 64, 100, 256, 1000] {
            let entries = entries_for(n);
            for seed in [1u8, 2, 7, 42, 200] {
                let nonce = nonce_of(seed);
                let (proof, commitment) = build_proof(&entries, &nonce, &peer);
                assert_eq!(
                    verify_subtree_proof(&proof, &nonce, &commitment),
                    StructureVerdict::Valid,
                    "n={n} seed={seed}"
                );
            }
        }
    }

    #[test]
    fn honest_proof_verifies_for_every_size_and_nonce() {
        // Regression for the left-packed self-pairing geometry bug: the proof
        // reconstruction must match the committed root for EVERY key count
        // (not just powers of two / cherry-picked sizes) and every nonce. An
        // earlier perfect-tree model false-failed honest nodes for ~70% of
        // sizes; this guards against any reintroduction.
        let peer = [7u8; 32];
        for n in 5u32..=600 {
            let entries = entries_for(n);
            for seed in 0u8..32 {
                let nonce = nonce_of(seed.wrapping_mul(17).wrapping_add(3));
                let (proof, commitment) = build_proof(&entries, &nonce, &peer);
                assert_eq!(
                    verify_subtree_proof(&proof, &nonce, &commitment),
                    StructureVerdict::Valid,
                    "honest proof must verify at n={n} seed={seed}"
                );
            }
        }
    }

    #[test]
    fn tampered_leaf_breaks_root() {
        let peer = [9u8; 32];
        let entries = entries_for(100);
        let nonce = nonce_of(5);
        let (mut proof, commitment) = build_proof(&entries, &nonce, &peer);
        proof.leaves[0].bytes_hash[0] ^= 0x01;
        assert!(matches!(
            verify_subtree_proof(&proof, &nonce, &commitment),
            StructureVerdict::Invalid(_)
        ));
    }

    #[test]
    fn tampered_cut_hash_breaks_root() {
        let peer = [9u8; 32];
        let entries = entries_for(256);
        let nonce = nonce_of(11);
        let (mut proof, commitment) = build_proof(&entries, &nonce, &peer);
        if let Some(c) = proof.sibling_cut_hashes.first_mut() {
            c[0] ^= 0x01;
        }
        assert!(matches!(
            verify_subtree_proof(&proof, &nonce, &commitment),
            StructureVerdict::Invalid(_)
        ));
    }

    #[test]
    fn wrong_leaf_count_rejected() {
        let peer = [9u8; 32];
        let entries = entries_for(100);
        let nonce = nonce_of(5);
        let (mut proof, commitment) = build_proof(&entries, &nonce, &peer);
        proof.leaves.pop();
        assert_eq!(
            verify_subtree_proof(&proof, &nonce, &commitment),
            StructureVerdict::Invalid("wrong leaf count")
        );
    }

    #[test]
    fn non_ascending_leaves_rejected() {
        let peer = [9u8; 32];
        let entries = entries_for(100);
        let nonce = nonce_of(5);
        let (mut proof, commitment) = build_proof(&entries, &nonce, &peer);
        if proof.leaves.len() >= 2 {
            proof.leaves.swap(0, 1);
        }
        assert!(matches!(
            verify_subtree_proof(&proof, &nonce, &commitment),
            StructureVerdict::Invalid(_)
        ));
    }

    // ---- spot-check selection ---------------------------------------------

    #[test]
    fn spotcheck_indices_in_range_and_distinct() {
        let n = 1024;
        let nonce = nonce_of(3);
        let path = select_subtree_path(&nonce, n).unwrap();
        let k = 8;
        let idxs = select_spotcheck_indices(&nonce, &path, k);
        assert_eq!(
            u32::try_from(idxs.len()).unwrap(),
            k.min(path.real_leaf_count())
        );
        let mut seen = HashSet::new();
        for i in &idxs {
            assert!(*i < path.real_leaf_count());
            assert!(seen.insert(*i), "duplicate spot-check index {i}");
        }
    }

    #[test]
    fn build_proof_reports_missing_bytes() {
        // A responder that no longer holds a selected, committed key's bytes
        // must surface MissingBytes (the caller turns this into a confirmed
        // failure, not a benign rejection).
        let entries = entries_for(100);
        let tree = MerkleTree::build(entries).unwrap();
        let nonce = nonce_of(5);
        let path = select_subtree_path(&nonce, tree.key_count()).unwrap();
        let victim = tree.key_at(path.leaf_start as usize).unwrap();
        let err = build_subtree_proof(&tree, &nonce, &[1u8; 32], |k| {
            if *k == victim {
                None
            } else {
                Some(chunk_bytes(k))
            }
        })
        .unwrap_err();
        assert_eq!(err, BuildProofError::MissingBytes { key: victim });
    }

    #[test]
    fn spotcheck_returns_all_when_subtree_small() {
        // Construct a path with few real leaves.
        let path = SubtreePath {
            depth: 0,
            slot: 0,
            leaf_start: 0,
            leaf_end: 3,
        };
        let idxs = select_spotcheck_indices(&nonce_of(1), &path, 8);
        assert_eq!(idxs, vec![0, 1, 2]);
    }

    #[test]
    fn spotcheck_always_yields_exactly_min_k_n_distinct_indices() {
        // The byte sample must NEVER be silently smaller than requested: a
        // short sample weakens round 2 without anyone noticing. Exercise many
        // nonces, subtree sizes, and k values, and require exactly min(k, n)
        // distinct in-range indices every time — plus determinism (auditor and
        // responder must derive the same set).
        for size in [1u32, 2, 3, 7, 8, 64, 1000] {
            let path = SubtreePath {
                depth: 0,
                slot: 0,
                leaf_start: 0,
                leaf_end: size,
            };
            for k in [1u32, 3, 5, 8] {
                for seed in 0..32u8 {
                    let nonce = nonce_of(seed);
                    let idxs = select_spotcheck_indices(&nonce, &path, k);
                    let expected = k.min(path.real_leaf_count()) as usize;
                    assert_eq!(
                        idxs.len(),
                        expected,
                        "size={size} k={k} seed={seed}: must yield exactly min(k, n)"
                    );
                    let mut seen = HashSet::new();
                    for i in &idxs {
                        assert!(*i < path.real_leaf_count(), "index out of range");
                        assert!(seen.insert(*i), "duplicate index {i}");
                    }
                    assert_eq!(
                        idxs,
                        select_spotcheck_indices(&nonce, &path, k),
                        "selection must be deterministic"
                    );
                }
            }
        }
    }

    #[test]
    fn fabricated_nonced_hash_caught_by_spotcheck_probability() {
        // Simulate the realness check: a responder fabricates a fraction x of
        // nonced hashes. The auditor spot-checks k leaves; probability all k
        // land on honest leaves is (1-x)^k. Here we just assert the auditor
        // *would* catch a fabricated leaf when it samples that position.
        let peer = [1u8; 32];
        let entries = entries_for(400);
        let nonce = nonce_of(9);
        let (mut proof, _commitment) = build_proof(&entries, &nonce, &peer);
        // Fabricate the nonced hash on the first subtree leaf (wrong bytes).
        proof.leaves[0].nonced_hash[0] ^= 0xFF;
        // The realness check the caller runs: recompute from the real chunk
        // bytes (the same fixture the honest tree was built from).
        let leaf = &proof.leaves[0];
        let real_bytes = chunk_bytes(&leaf.key);
        let expected = nonced_leaf_hash(&nonce, &peer, &leaf.key, &real_bytes);
        assert_ne!(
            leaf.nonced_hash, expected,
            "fabricated nonced hash must differ from real"
        );
    }

    // ---- branch-substitution attack ---------------------------------------

    #[test]
    fn responder_cannot_substitute_a_different_branch() {
        // ADR-0002 "Subtree selection": the random value alone fixes WHICH
        // branch is selected, so "the audited node cannot choose a convenient
        // branch to present."
        //
        // Attack: the responder builds a fully valid, internally-consistent
        // subtree proof for a DIFFERENT nonce (which the selection maps to a
        // different branch of the same committed tree), then presents it as the
        // answer to the auditor's nonce. Every leaf hash and every cut-hash is
        // genuine, the leaves are strictly ascending, and we deliberately pick
        // a decoy whose branch has the SAME leaf count and SAME depth as the
        // honest branch — so the cheap "wrong leaf count" / "wrong cut-hash
        // count" gates do NOT fire. The ONLY thing that can reject it is the
        // structural root re-derivation, which climbs using the auditor's
        // nonce-derived slot parity and position. It must reject.
        let peer = [0x5Au8; 32];
        let n = 1024u32; // balanced tree; sqrt floor = 32
        let entries = entries_for(n);

        let audit_nonce = nonce_of(7);
        let audit_path = select_subtree_path(&audit_nonce, n).unwrap();

        // Find a decoy nonce whose selected branch is a DIFFERENT slot but the
        // SAME depth (hence same real-leaf count for this balanced tree). This
        // forces rejection via the root check rather than a count mismatch.
        let (decoy_nonce, decoy_path) = (0u8..=255)
            .map(|seed| {
                let cand_nonce = nonce_of(seed);
                let cand = select_subtree_path(&cand_nonce, n).unwrap();
                (cand_nonce, cand)
            })
            .find(|(_, cand)| {
                cand.depth == audit_path.depth
                    && cand.slot != audit_path.slot
                    && cand.real_leaf_count() == audit_path.real_leaf_count()
            })
            .expect("a same-depth, different-slot decoy branch must exist for n=1024");

        // Sanity: the decoy really is a different, equally-shaped branch.
        assert_ne!(decoy_path.slot, audit_path.slot);
        assert_eq!(decoy_path.depth, audit_path.depth);
        assert_eq!(decoy_path.real_leaf_count(), audit_path.real_leaf_count());

        // The responder builds a genuine proof for the DECOY branch. Note the
        // nonced hashes are built with the decoy nonce too — but that does not
        // matter: the structural check below never inspects nonced hashes, and
        // the attack must already die on structure.
        let tree = MerkleTree::build(entries).unwrap();
        let decoy_proof =
            build_subtree_proof(&tree, &decoy_nonce, &peer, |k| Some(chunk_bytes(k))).unwrap();

        // Pin the auditor's commitment to the genuine root of the same tree.
        let commitment = fake_commitment(tree.root(), n, peer);

        // The honest answer to the SAME commitment + decoy nonce verifies, so
        // the proof itself is well-formed — it is only "wrong" relative to the
        // auditor's nonce.
        assert_eq!(
            verify_subtree_proof(&decoy_proof, &decoy_nonce, &commitment),
            StructureVerdict::Valid,
            "the decoy proof must be a genuinely valid proof for its own nonce"
        );

        // The attack: present the decoy-branch proof against the AUDIT nonce.
        // The count gates cannot fire (same depth + leaf count by construction),
        // so this is the root re-derivation rejecting a substituted branch.
        let verdict = verify_subtree_proof(&decoy_proof, &audit_nonce, &commitment);
        assert_eq!(
            verdict,
            StructureVerdict::Invalid("root mismatch"),
            "substituting a different valid branch must be rejected by the root check, got {verdict:?}"
        );
    }
}
+ Ok(count) => usize::try_from(count).map_or_else( + |_| { + warn!( + "current_chunks() count {count} overflows usize; keeping previous quote \ + metric" + ); + }, + |records| self.quote_generator.resync_records(records), + ), + Err(e) => { + warn!("Failed to read current_chunks() for quote metric resync: {e}"); + } + } + } + /// Handle a quote request. fn handle_quote(&self, request: &ChunkQuoteRequest) -> ChunkQuoteResponse { let addr_hex = hex::encode(request.address); let data_size = request.data_size; debug!("Handling quote request for {addr_hex} (size: {data_size})"); + // Price on records ACTUALLY HELD, not a monotonic store counter. + self.resync_quote_metric(); + // Check if the chunk is already stored so we can tell the client // to skip payment (already_stored = true). // The match intentionally logs the error when the `logging` feature is @@ -428,6 +471,9 @@ impl AntProtocol { request.merkle_payment_timestamp ); + // Price on records ACTUALLY HELD, not a monotonic store counter. + self.resync_quote_metric(); + let Ok(data_size_usize) = usize::try_from(request.data_size) else { return MerkleCandidateQuoteResponse::Error(ProtocolError::QuoteFailed(format!( "data_size {} overflows usize", @@ -1066,4 +1112,90 @@ mod tests { other => panic!("expected Success with already_stored=false, got: {other:?}"), } } + + /// Drive the real quote handler, then read the record count it priced on. + /// The handler calls `resync_quote_metric` first, so this reflects records + /// ACTUALLY HELD. 
+ fn priced_records_after_quote(protocol: &AntProtocol) -> usize { + let quote_request = ChunkQuoteRequest { + address: [0xAAu8; 32], // a quote-only probe, not one of the stored chunks + data_size: 100, + data_type: DATA_TYPE_CHUNK, + }; + let _ = protocol.handle_quote(&quote_request); + protocol.priced_records_stored() + } + + /// The quote price must track records ACTUALLY HELD: deleting stored chunks + /// must lower the priced record count, not keep quoting as if the data were + /// still held. Exercises the storage-driven resync in `resync_quote_metric`. + #[tokio::test] + async fn test_quote_metric_reflects_deletions() { + let (protocol, _temp) = create_test_protocol().await; + + // Distinct content -> distinct content-addressed keys. + let contents: Vec<Vec<u8>> = (0u8..5).map(|i| vec![i; 64]).collect(); + let mut addresses = Vec::new(); + for content in &contents { + let addr = LmdbStorage::compute_address(content); + protocol.put_local(&addr, content).await.expect("put_local"); + addresses.push(addr); + } + + // 5 records held -> priced count 5. + assert_eq!(priced_records_after_quote(&protocol), 5); + + // Delete 2 chunks the node was holding. + for addr in addresses.iter().take(2) { + assert!(protocol.storage().delete(addr).await.expect("delete")); + } + assert_eq!(priced_records_after_quote(&protocol), 3); + + // Delete the rest; priced count floors at 0, never underflows. + for addr in addresses.iter().skip(2) { + assert!(protocol.storage().delete(addr).await.expect("delete")); + } + assert_eq!(priced_records_after_quote(&protocol), 0); + } + + /// Stronger, externally-observable proof: the actual quote PRICE returned + /// to a client must drop after the node deletes data it held. A monotonic + /// store counter would keep the price elevated; the resync ties price to + /// records actually held. + /// FLIPS IF: `resync_quote_metric` is removed — the price would stay at the + /// 10-record level even after deletions (`record_store` only ever increments). 
+ #[tokio::test] + async fn test_quote_price_drops_after_deletion() { + use crate::payment::pricing::calculate_price; + + let (protocol, _temp) = create_test_protocol().await; + let contents: Vec<Vec<u8>> = (0u8..10).map(|i| vec![i; 64]).collect(); + let mut addresses = Vec::new(); + for content in &contents { + let addr = LmdbStorage::compute_address(content); + protocol.put_local(&addr, content).await.expect("put_local"); + addresses.push(addr); + } + + // Drive a real quote; the priced count must equal records held (10), + // and the price must equal calculate_price(10) — the externally + // observable contract. + assert_eq!(priced_records_after_quote(&protocol), 10); + let price_full = calculate_price(10); + + // Delete 8 of 10 held chunks. + for addr in addresses.iter().take(8) { + assert!(protocol.storage().delete(addr).await.expect("delete")); + } + // The next quote must price on 2 records, and the price must be the + // calculate_price(2) value — strictly different from the 10-record + // price (price is monotonic non-decreasing in records_stored). 
+ assert_eq!(priced_records_after_quote(&protocol), 2); + let price_after = calculate_price(2); + assert!( + price_after < price_full, + "deleting data must lower the observable quote price \ + (full={price_full:?}, after={price_after:?})" + ); + } } diff --git a/tests/e2e/mod.rs b/tests/e2e/mod.rs index 87e63e21..994dc31f 100644 --- a/tests/e2e/mod.rs +++ b/tests/e2e/mod.rs @@ -63,6 +63,9 @@ mod replication; #[cfg(test)] mod security_attacks; +#[cfg(test)] +mod subtree_audit_testnet; + pub use anvil::TestAnvil; pub use harness::TestHarness; pub use testnet::{NetworkState, NodeState, TestNetwork, TestNetworkConfig, TestNode}; diff --git a/tests/e2e/replication.rs b/tests/e2e/replication.rs index 83fc792f..c94e6bbb 100644 --- a/tests/e2e/replication.rs +++ b/tests/e2e/replication.rs @@ -7,6 +7,7 @@ use super::TestHarness; use ant_node::client::compute_address; +use ant_node::replication::commitment_state::{BuiltCommitment, ResponderCommitmentState}; use ant_node::replication::config::REPLICATION_PROTOCOL_ID; use ant_node::replication::protocol::{ compute_audit_digest, AuditChallenge, AuditResponse, FetchRequest, FetchResponse, @@ -389,6 +390,11 @@ async fn test_audit_challenge_returns_correct_digest() { let nonce = [0x42u8; 32]; // Send audit challenge from B to A + // The on-wire `AuditChallenge` is handled by the responsible-chunk audit + // responder (`audit::handle_audit_challenge`), which answers with per-key + // `Digests`. The prune-confirmation audit reuses the same message. (The + // storage-commitment audit uses the separate + // `SubtreeAuditChallenge`/`SubtreeAuditResponse` path.) 
let challenge = AuditChallenge { challenge_id: 1234, nonce, @@ -528,6 +534,7 @@ async fn test_prune_pass_requires_remote_confirmation_before_delete() { repair_proofs: &repair_proofs, current_sync_epoch: CURRENT_EPOCH, allow_remote_prune_audits: false, + commitment_state: None, }) .await; assert_eq!(blocked.records_pruned, 0); @@ -546,6 +553,7 @@ async fn test_prune_pass_requires_remote_confirmation_before_delete() { repair_proofs: &repair_proofs, current_sync_epoch: CURRENT_EPOCH, allow_remote_prune_audits: true, + commitment_state: None, }) .await; assert_eq!(confirmed.records_pruned, 1); @@ -589,6 +597,7 @@ async fn test_prune_pass_requires_remote_confirmation_before_delete() { repair_proofs: &repair_proofs, current_sync_epoch: CURRENT_EPOCH, allow_remote_prune_audits: true, + commitment_state: None, }) .await; assert_eq!(incomplete.records_pruned, 0); @@ -615,6 +624,7 @@ async fn test_prune_pass_requires_remote_confirmation_before_delete() { repair_proofs: &repair_proofs, current_sync_epoch: CURRENT_EPOCH, allow_remote_prune_audits: true, + commitment_state: None, }) .await; assert_eq!(complete.records_pruned, 1); @@ -626,6 +636,131 @@ async fn test_prune_pass_requires_remote_confirmation_before_delete() { harness.teardown().await.expect("teardown"); } +/// Pruner-retention veto (ADR-0002): a chunk the pruner is no longer responsible +/// for, but which is still committed under a recently-gossiped commitment, must +/// NOT be deleted — the storage-commitment audit's round-2 byte challenge could +/// still demand it, and deleting would turn an honest node's reply into an +/// `Absent` confirmed failure. Once it is no longer committed (e.g. it has aged +/// out of the retention window, simulated here by passing `None`), the same +/// out-of-range record becomes prunable. Drives the real `run_prune_pass` +/// against live nodes. 
+#[tokio::test] +#[serial] +async fn test_prune_veto_for_committed_out_of_range_key() { + const HINT_EPOCH: u64 = 7; + const CURRENT_EPOCH: u64 = HINT_EPOCH + 1; + + let harness = TestHarness::setup_minimal().await.expect("setup"); + harness.warmup_dht().await.expect("warmup"); + + let pruner_idx = 3; + let close_group_size = 2; + let config = prune_test_config(close_group_size); + let sync_state = Arc::new(RwLock::new(NeighborSyncState::new_cycle(vec![]))); + let repair_proofs = Arc::new(RwLock::new(RepairProofs::new())); + + let pruner = harness.test_node(pruner_idx).expect("pruner"); + let pruner_p2p = Arc::clone(pruner.p2p_node.as_ref().expect("pruner p2p")); + let pruner_storage = pruner.ant_protocol.as_ref().expect("protocol").storage(); + let pruner_paid_list = Arc::clone( + pruner + .replication_engine + .as_ref() + .expect("engine") + .paid_list(), + ); + let pruner_peer = *pruner_p2p.peer_id(); + + // An out-of-range record fully confirmed on its remote close group — so the + // ONLY thing that can keep it on the pruner is the retention veto. + let (content, address, targets) = + find_remote_prune_candidate(&harness, pruner_idx, close_group_size, "veto").await; + pruner_storage + .put(&address, &content) + .await + .expect("put record on pruner"); + store_record_on_peers(&harness, &targets, &address, &content).await; + record_repair_proofs_for_peers( + &repair_proofs, + &pruner_p2p, + &config, + &targets, + &address, + HINT_EPOCH, + ) + .await; + + // A retained commitment that COMMITS to the out-of-range key (as if we + // gossiped it just before the key left our range). A throwaway keypair is + // fine: the pruner's veto consults only `is_held` (membership), not the + // signature. 
+ let committed = ResponderCommitmentState::new(); + { + let (pk, sk) = saorsa_pqc::api::sig::ml_dsa_65() + .generate_keypair() + .expect("keypair"); + let bytes_hash = *blake3::hash(&content).as_bytes(); + let built = + BuiltCommitment::build(vec![(address, bytes_hash)], &[0; 32], &sk, &pk.to_bytes()) + .expect("build commitment"); + let h = built.hash(); + committed.rotate(built); + committed.mark_gossiped(h); + } + let committed = Arc::new(committed); + assert!(committed.is_held(&address), "test setup: key must be held"); + + // With the key still committed, an otherwise-fully-prunable out-of-range + // record is VETOED. + let vetoed = pruning::run_prune_pass_with_context(pruning::PrunePassContext { + self_id: &pruner_peer, + storage: &pruner_storage, + paid_list: &pruner_paid_list, + p2p_node: &pruner_p2p, + config: &config, + sync_state: &sync_state, + repair_proofs: &repair_proofs, + current_sync_epoch: CURRENT_EPOCH, + allow_remote_prune_audits: true, + commitment_state: Some(&committed), + }) + .await; + assert_eq!( + vetoed.records_pruned, 0, + "a key still committed under a recent commitment must not be pruned" + ); + assert!( + pruner_storage.exists(&address).expect("exists"), + "the vetoed record must remain on disk" + ); + + // Once it is no longer committed (aged out of the retention window — modelled + // by `None`), the same out-of-range record is prunable. 
+ let pruned = pruning::run_prune_pass_with_context(pruning::PrunePassContext { + self_id: &pruner_peer, + storage: &pruner_storage, + paid_list: &pruner_paid_list, + p2p_node: &pruner_p2p, + config: &config, + sync_state: &sync_state, + repair_proofs: &repair_proofs, + current_sync_epoch: CURRENT_EPOCH, + allow_remote_prune_audits: true, + commitment_state: None, + }) + .await; + assert_eq!( + pruned.records_pruned, 1, + "once no longer committed, the out-of-range record prunes normally" + ); + assert!( + !pruner_storage.exists(&address).expect("exists"), + "the no-longer-committed record is reclaimed" + ); + + harness.teardown().await.expect("teardown"); +} + /// Fetch not-found returns `NotFound`. /// /// Request a key that does not exist on the target node and verify @@ -805,6 +940,7 @@ async fn test_neighbor_sync_request_returns_hints() { replica_hints: vec![], paid_hints: vec![], bootstrapping: false, + commitment: None, }; let msg = ReplicationMessage { request_id: 2000, @@ -1254,6 +1390,7 @@ async fn scenario_14_sync_hints_cover_all_local_keys() { replica_hints: vec![], paid_hints: vec![], bootstrapping: false, + commitment: None, }; let msg = ReplicationMessage { request_id: 1400, @@ -1401,6 +1538,7 @@ async fn scenario_17_bidirectional_sync_when_sender_in_rt() { replica_hints: vec![inbound_hint], paid_hints: vec![], bootstrapping: false, + commitment: None, }; let msg = ReplicationMessage { request_id: 1700, diff --git a/tests/e2e/subtree_audit_testnet.rs b/tests/e2e/subtree_audit_testnet.rs new file mode 100644 index 00000000..c21dce66 --- /dev/null +++ b/tests/e2e/subtree_audit_testnet.rs @@ -0,0 +1,198 @@ +//! Local-testnet end-to-end tests for the gossip-triggered contiguous-subtree +//! storage audit (ADR-0002). +//! +//! These spin a real multi-node testnet and drive the SHIPPED audit over the +//! live wire (real `handle_subtree_challenge` responder + `run_subtree_audit` +//! auditor + real LMDB storage), via the test-only `audit_peer_now` / +//! 
`rebuild_commitment_now` engine hooks. They prove the two outcomes that +//! matter for a testnet: +//! +//! 1. HONEST: an honest node that holds its committed data passes the audit +//! (no false-positive eviction). +//! 2. ADVERSARY: a node that deletes the bytes it committed to fails the audit +//! (a confirmed failure that, once eviction is re-enabled, evicts it) while +//! honest nodes are unaffected. + +#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] + +use super::TestHarness; +use ant_node::replication::audit::AuditTickResult; +use serial_test::serial; + +/// Store the same `n` chunks on both `a` (the audited holder) and `b` (the +/// auditor — NOT because verification needs them: round 2 demands the bytes +/// from `a` itself, so `b` could hold nothing; storing them just makes `b` a +/// realistic co-holder of the keyspace), make `a` commit to them, +/// then deterministically seed `b`'s cache with `a`'s commitment (simulating +/// "b received a's gossip" without depending on neighbor-sync timing — that +/// propagation is covered by the dedicated neighbor-sync tests). After this, +/// `b.audit_peer_now(a)` pins `a`'s real commitment and runs the audit over the +/// live wire against `a`'s real responder. +async fn commit_and_seed( + harness: &TestHarness, + a_idx: usize, + b_idx: usize, + n: usize, +) -> Vec<[u8; 32]> { + let a = harness.test_node(a_idx).expect("node a"); + let b = harness.test_node(b_idx).expect("node b"); + let a_store = a.ant_protocol.as_ref().expect("a protocol").storage(); + let b_store = b.ant_protocol.as_ref().expect("b protocol").storage(); + + // Store identical chunks on A and B. Content-addressed: addr == BLAKE3(bytes). 
+ let mut addrs = Vec::with_capacity(n); + for i in 0..n { + let content = format!("subtree-audit-testnet-chunk-{i}").into_bytes(); + let address = *blake3::hash(&content).as_bytes(); + a_store.put(&address, &content).await.expect("put on a"); + b_store.put(&address, &content).await.expect("put on b"); + addrs.push(address); + } + + // A commits to its current key set. + let a_engine = a.replication_engine.as_ref().expect("a engine"); + a_engine + .rebuild_commitment_now() + .await + .expect("a rebuild commitment"); + + // Grab A's freshly built commitment and seed it into B's cache so B can pin + // it (deterministic; no gossip-timing flake). + let a_peer = *a.p2p_node.as_ref().expect("a p2p").peer_id(); + let a_commitment = a_engine + .commitment_state() + .current() + .expect("a has a current commitment") + .commitment() + .clone(); + let b_engine = b.replication_engine.as_ref().expect("b engine"); + b_engine + .inject_peer_commitment_for_test(&a_peer, a_commitment) + .await; + addrs +} + +/// HONEST: a node holding its committed data passes the subtree audit. +#[tokio::test] +#[serial] +async fn honest_node_passes_subtree_audit() { + let harness = TestHarness::setup_small().await.expect("setup"); + harness.warmup_dht().await.expect("warmup"); + + let (a_idx, b_idx) = (3, 4); + commit_and_seed(&harness, a_idx, b_idx, 64).await; + + let a_peer = *harness + .test_node(a_idx) + .expect("a") + .p2p_node + .as_ref() + .expect("a p2p") + .peer_id(); + let b_engine = harness + .test_node(b_idx) + .expect("b") + .replication_engine + .as_ref() + .expect("b engine"); + + // Honest holder: B holds the chunks so it byte-verifies the proof → Passed. + let result = b_engine.audit_peer_now(&a_peer).await; + assert!( + matches!(result, AuditTickResult::Passed { keys_checked, .. 
} if keys_checked >= 1), + "honest node must pass with at least one byte-verified leaf, got {result:?}" + ); + + harness.teardown().await.expect("teardown"); +} + +/// ADVERSARY: a node that deletes the bytes it committed to FAILS the audit, +/// while honest peers are unaffected. +#[tokio::test] +#[serial] +async fn data_deleting_node_fails_subtree_audit() { + let harness = TestHarness::setup_small().await.expect("setup"); + harness.warmup_dht().await.expect("warmup"); + + let (a_idx, b_idx) = (5, 6); + let addrs = commit_and_seed(&harness, a_idx, b_idx, 64).await; + + // A is now committed-and-gossiped. The adversary deletes ALL the bytes it + // committed to (keeps the gossiped commitment — the classic "claim storage, + // hold nothing" attack). It does NOT rebuild its commitment, so it still + // advertises the now-unbacked root. + let a_store = harness + .test_node(a_idx) + .expect("a") + .ant_protocol + .as_ref() + .expect("a protocol") + .storage(); + for addr in &addrs { + a_store.delete(addr).await.expect("delete on adversary"); + } + + let a_peer = *harness + .test_node(a_idx) + .expect("a") + .p2p_node + .as_ref() + .expect("a p2p") + .peer_id(); + let b_engine = harness + .test_node(b_idx) + .expect("b") + .replication_engine + .as_ref() + .expect("b engine"); + + let result = b_engine.audit_peer_now(&a_peer).await; + // The adversary can no longer produce the subtree's bytes, so its responder + // rejects ("missing bytes for committed key") → a confirmed Failed. (It must + // NOT be Passed; Idle would mean B couldn't reach the audit, also a failure + // of the test setup.) + assert!( + matches!(result, AuditTickResult::Failed { .. }), + "a node that deleted its committed data must FAIL the audit, got {result:?}" + ); + + harness.teardown().await.expect("teardown"); +} + +/// NO FALSE POSITIVE: auditing an honest node repeatedly (different nonces) +/// never produces a confirmed failure. 
+#[tokio::test] +#[serial] +async fn honest_node_never_false_fails_across_repeated_audits() { + let harness = TestHarness::setup_small().await.expect("setup"); + harness.warmup_dht().await.expect("warmup"); + + let (a_idx, b_idx) = (7, 8); + commit_and_seed(&harness, a_idx, b_idx, 100).await; + + let a_peer = *harness + .test_node(a_idx) + .expect("a") + .p2p_node + .as_ref() + .expect("a p2p") + .peer_id(); + let b_engine = harness + .test_node(b_idx) + .expect("b") + .replication_engine + .as_ref() + .expect("b engine"); + + // Each audit uses a fresh random nonce (different selected subtree). None may + // ever be a confirmed Failed for an honest holder. + for round in 0..8 { + let result = b_engine.audit_peer_now(&a_peer).await; + assert!( + !matches!(result, AuditTickResult::Failed { .. }), + "honest node false-failed on round {round}: {result:?}" + ); + } + + harness.teardown().await.expect("teardown"); +} diff --git a/tests/e2e/testnet.rs b/tests/e2e/testnet.rs index 14216be0..3ae6c226 100644 --- a/tests/e2e/testnet.rs +++ b/tests/e2e/testnet.rs @@ -1239,16 +1239,23 @@ impl TestNetwork { })); } - // Start replication engine for this node - if let (Some(ref p2p), Some(ref protocol)) = (&node.p2p_node, &node.ant_protocol) { + // Start replication engine for this node. A node without an identity + // skips ONLY the engine (no early return — the node must still be + // tracked in `self.nodes` below, or its already-started P2P/protocol + // tasks would keep running untracked by the harness). 
+ if let (Some(ref p2p), Some(ref protocol), Some(ref id)) = + (&node.p2p_node, &node.ant_protocol, &node.node_identity) + { let shutdown = CancellationToken::new(); let repl_config = ReplicationConfig::default(); let (_fresh_tx, fresh_rx) = tokio::sync::mpsc::unbounded_channel(); + let node_identity = Arc::clone(id); match ReplicationEngine::new( repl_config, Arc::clone(p2p), protocol.storage(), protocol.payment_verifier_arc(), + node_identity, &node.data_dir, fresh_rx, shutdown.clone(), @@ -1269,6 +1276,11 @@ impl TestNetwork { ); } } + } else if node.node_identity.is_none() { + warn!( + "Node {} has no identity; skipping replication engine", + node.index + ); } debug!("Node {} started successfully", node.index); diff --git a/tests/poc_audit_handler_live.rs b/tests/poc_audit_handler_live.rs new file mode 100644 index 00000000..89e7326e --- /dev/null +++ b/tests/poc_audit_handler_live.rs @@ -0,0 +1,427 @@ +//! Live responder-handler integration tests for the gossip-triggered +//! contiguous-subtree storage audit (ADR-0002). +//! +//! The pure proof maths are covered by the unit tests in +//! `src/replication/subtree.rs`, and the end-to-end attack composition by +//! `poc_commitment_audit_attacks`. This file fills the remaining gap: the +//! *live* responder control-flow branches in +//! [`ant_node::replication::storage_commitment_audit::handle_subtree_challenge`] — the function the +//! network actually calls — driven against a real `LmdbStorage` and a real +//! `ResponderCommitmentState`, asserting on the exact `SubtreeAuditResponse` +//! variant produced. +//! +//! Each test is written to FAIL if the defence it covers is removed — see the +//! `// FLIPS IF:` note on each. They are not tautologies: the responder under +//! test is the production code path, not a reimplementation. 
+ +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::missing_panics_doc, + clippy::cast_possible_truncation +)] + +use std::sync::Arc; + +use ant_node::replication::commitment_state::{BuiltCommitment, ResponderCommitmentState}; +use ant_node::replication::config::MAX_BYTE_CHALLENGE_KEYS; +use ant_node::replication::protocol::{ + SubtreeAuditChallenge, SubtreeAuditResponse, SubtreeByteChallenge, SubtreeByteItem, + SubtreeByteResponse, +}; +use ant_node::replication::storage_commitment_audit::{ + handle_subtree_byte_challenge, handle_subtree_challenge, +}; +use ant_node::replication::subtree::{verify_subtree_proof, StructureVerdict}; +use ant_node::storage::{LmdbStorage, LmdbStorageConfig}; +use saorsa_core::identity::PeerId; +use saorsa_pqc::api::sig::{ml_dsa_65, MlDsaPublicKey, MlDsaSecretKey}; +use tempfile::TempDir; + +// --------------------------------------------------------------------------- +// Fixtures +// --------------------------------------------------------------------------- + +async fn test_storage() -> (LmdbStorage, TempDir) { + let temp_dir = TempDir::new().expect("create temp dir"); + let config = LmdbStorageConfig { + root_dir: temp_dir.path().to_path_buf(), + ..LmdbStorageConfig::test_default() + }; + let storage = LmdbStorage::new(config).await.expect("create storage"); + (storage, temp_dir) +} + +fn keypair() -> (MlDsaPublicKey, MlDsaSecretKey) { + ml_dsa_65().generate_keypair().unwrap() +} + +/// Deterministic chunk content for index `i` (>= store MIN size). Distinct per +/// index so each address is distinct. +fn chunk_content(i: u8) -> Vec<u8> { + (0..1024u32).map(|n| (n as u8) ^ i).collect() +} + +/// A responder identity bound to a freshly-built commitment over the given +/// chunk indices, with those chunks actually stored in `storage`. 
+struct Responder { + peer_id: PeerId, + peer_id_bytes: [u8; 32], + state: Arc<ResponderCommitmentState>, +} + +impl Responder { + /// Build a responder that has stored `indices` and committed to them. + /// The committed leaf binds `(address, BLAKE3(content))`; the responder + /// reads bytes by address at audit time and rehashes them. + async fn new(storage: &LmdbStorage, indices: &[u8]) -> Self { + let (pk, sk) = keypair(); + // Production identity derivation: peer_id == BLAKE3(pubkey_bytes). + let peer_id_bytes = *blake3::hash(&pk.to_bytes()).as_bytes(); + let peer_id = PeerId::from_bytes(peer_id_bytes); + + let mut entries = Vec::new(); + for &i in indices { + let content = chunk_content(i); + let addr = LmdbStorage::compute_address(&content); + storage.put(&addr, &content).await.expect("put chunk"); + let bytes_hash = *blake3::hash(&content).as_bytes(); + entries.push((addr, bytes_hash)); + } + let built = + BuiltCommitment::build(entries, &peer_id_bytes, &sk, &pk.to_bytes()).expect("build"); + let state = Arc::new(ResponderCommitmentState::new()); + state.rotate(built); + + Self { + peer_id, + peer_id_bytes, + state, + } + } + + fn current_hash(&self) -> [u8; 32] { + self.state.current().unwrap().hash() + } + + fn address(i: u8) -> [u8; 32] { + LmdbStorage::compute_address(&chunk_content(i)) + } +} + +fn challenge_for(responder: &Responder, pin: [u8; 32], nonce: [u8; 32]) -> SubtreeAuditChallenge { + SubtreeAuditChallenge { + challenge_id: 42, + nonce, + challenged_peer_id: responder.peer_id_bytes, + expected_commitment_hash: pin, + } +} + +// --------------------------------------------------------------------------- +// 1. Honest responder, pinned to its gossiped commitment -> Proof +// --------------------------------------------------------------------------- + +/// Baseline: a challenge pinned to the responder's retained commitment, with +/// all committed bytes present, yields a `Proof` whose commitment matches the +/// pin and whose subtree proof passes `verify_subtree_proof`. 
Anchors the +/// failure-path tests — it proves the happy path is reachable, so a Rejected in +/// another test is the defence firing, not an unrelated error. +#[tokio::test] +async fn honest_responder_answers_with_valid_proof() { + let (storage, _t) = test_storage().await; + // Enough leaves to exercise a real (non-whole-tree) subtree selection. + let indices: Vec<u8> = (1..=64u8).collect(); + let r = Responder::new(&storage, &indices).await; + let pin = r.current_hash(); + let nonce = [0x11u8; 32]; + let challenge = challenge_for(&r, pin, nonce); + + let resp = + handle_subtree_challenge(&challenge, &storage, &r.peer_id, false, Some(&r.state)).await; + + match resp { + SubtreeAuditResponse::Proof { + challenge_id, + commitment, + proof, + } => { + assert_eq!(challenge_id, 42); + // The answered commitment is the pinned one. + assert_eq!( + ant_node::replication::commitment::commitment_hash(&commitment), + Some(pin), + ); + // And the proof structurally verifies under the nonce + commitment. + assert_eq!( + verify_subtree_proof(&proof, &nonce, &commitment), + StructureVerdict::Valid, + "honest responder's proof must verify" + ); + } + other => panic!("expected Proof, got {other:?}"), + } +} + +// --------------------------------------------------------------------------- +// 2. Bootstrapping responder -> Bootstrapping (never penalised) +// --------------------------------------------------------------------------- + +/// A responder still bootstrapping answers `Bootstrapping`, not a proof — it +/// must not be penalised for not yet holding data. +/// +/// FLIPS IF: the bootstrap shortcut were removed and a bootstrapping node tried +/// (and failed) to build a proof, exposing fresh nodes to audit penalties. 
+#[tokio::test] +async fn bootstrapping_responder_reports_bootstrapping() { + let (storage, _t) = test_storage().await; + let r = Responder::new(&storage, &[1, 2, 3, 4]).await; + let pin = r.current_hash(); + let challenge = challenge_for(&r, pin, [0x11u8; 32]); + + let resp = handle_subtree_challenge( + &challenge, + &storage, + &r.peer_id, + /* is_bootstrapping */ true, + Some(&r.state), + ) + .await; + + assert!( + matches!( + resp, + SubtreeAuditResponse::Bootstrapping { challenge_id: 42 } + ), + "expected Bootstrapping, got {resp:?}" + ); +} + +// --------------------------------------------------------------------------- +// 3. Challenge targeting the wrong peer -> Rejected +// --------------------------------------------------------------------------- + +/// A challenge whose `challenged_peer_id` is not this node is rejected — a node +/// must only answer audits addressed to it (so an attacker can't make node A +/// answer for node B's committed tree). +/// +/// FLIPS IF: the target-peer check were dropped and a node answered challenges +/// addressed to anyone. +#[tokio::test] +async fn wrong_target_peer_is_rejected() { + let (storage, _t) = test_storage().await; + let r = Responder::new(&storage, &[1, 2, 3, 4]).await; + let pin = r.current_hash(); + let mut challenge = challenge_for(&r, pin, [0x11u8; 32]); + // Address the challenge to a different peer. + challenge.challenged_peer_id = [0x99u8; 32]; + + let resp = + handle_subtree_challenge(&challenge, &storage, &r.peer_id, false, Some(&r.state)).await; + + match resp { + SubtreeAuditResponse::Rejected { + challenge_id, + reason, + } => { + assert_eq!(challenge_id, 42); + assert!( + reason.contains("does not match this node"), + "expected wrong-peer rejection, got: {reason}" + ); + } + other => panic!("expected Rejected(wrong peer), got {other:?}"), + } +} + +// --------------------------------------------------------------------------- +// 4. 
Pinned hash the responder does not retain -> Rejected "unknown commitment" +// --------------------------------------------------------------------------- + +/// A challenge pinned to a commitment hash the responder's state does not +/// contain is rejected with "unknown commitment hash", NOT silently answered +/// against the current commitment. Since the auditor only pins a hash the peer +/// just gossiped, this rejection is the auditor's confirmed-failure signal. +/// +/// FLIPS IF: the responder ignored the pin and answered against its current +/// commitment regardless — the pin contract would be void and a lazy node could +/// answer any challenge with any tree. +#[tokio::test] +async fn unknown_pinned_hash_is_rejected() { + let (storage, _t) = test_storage().await; + let r = Responder::new(&storage, &[1, 2, 3, 4]).await; + // A hash the responder never built/retained. + let bogus_pin = [0x99u8; 32]; + let challenge = challenge_for(&r, bogus_pin, [0x11u8; 32]); + + let resp = + handle_subtree_challenge(&challenge, &storage, &r.peer_id, false, Some(&r.state)).await; + + match resp { + SubtreeAuditResponse::Rejected { reason, .. } => { + assert!( + reason.contains("unknown commitment hash"), + "expected unknown-commitment-hash rejection, got: {reason}" + ); + } + other => panic!("expected Rejected(unknown commitment hash), got {other:?}"), + } +} + +/// No commitment state at all (e.g. before the first rotation during rollout) +/// is likewise rejected — there is nothing to answer the pin against. +#[tokio::test] +async fn missing_commitment_state_is_rejected() { + let (storage, _t) = test_storage().await; + let r = Responder::new(&storage, &[1, 2, 3, 4]).await; + let pin = r.current_hash(); + let challenge = challenge_for(&r, pin, [0x11u8; 32]); + + // Pass None for commitment_state. + let resp = handle_subtree_challenge(&challenge, &storage, &r.peer_id, false, None).await; + + assert!( + matches!(resp, SubtreeAuditResponse::Rejected { .. 
}), + "expected Rejected when no commitment state, got {resp:?}" + ); +} + +// --------------------------------------------------------------------------- +// 5. Committed key whose bytes were deleted -> Rejected "missing bytes..." +// --------------------------------------------------------------------------- + +/// The chunk-deleter case: the responder committed to a key, the auditor pins +/// that commitment, but the responder has since dropped the actual bytes for a +/// key the nonce-selected subtree covers. It cannot fabricate the leaf (the +/// nonced hash is bound to the bytes), so it rejects with the distinct "missing +/// bytes for committed key" reason — which the auditor treats as real storage +/// loss and penalises. +/// +/// To guarantee the deleted key falls inside the selected subtree, we delete +/// EVERY committed chunk's bytes, so whichever leaves the nonce selects, at +/// least one is missing. +/// +/// FLIPS IF: the responder could answer a committed key without holding the +/// bytes — exactly the storage-binding hole the subtree audit closes. +#[tokio::test] +async fn committed_key_with_missing_bytes_is_rejected() { + let (storage, _t) = test_storage().await; + let indices: Vec = (1..=32u8).collect(); + let r = Responder::new(&storage, &indices).await; + let pin = r.current_hash(); + + // Drop the bytes for every committed chunk AFTER committing, so any selected + // subtree contains at least one key whose bytes are gone. + for &i in &indices { + let addr = Responder::address(i); + storage.delete(&addr).await.expect("delete chunk"); + } + + let challenge = challenge_for(&r, pin, [0x11u8; 32]); + let resp = + handle_subtree_challenge(&challenge, &storage, &r.peer_id, false, Some(&r.state)).await; + + match resp { + SubtreeAuditResponse::Rejected { reason, .. 
} => { + assert!( + reason.contains("missing bytes for committed key"), + "expected missing-bytes rejection, got: {reason}" + ); + } + other => panic!("expected Rejected(missing bytes), got {other:?}"), + } +} + +// --------------------------------------------------------------------------- +// 6. Round 2 (byte challenge): honest serve + oversize-request rejection +// --------------------------------------------------------------------------- + +/// Round-2 happy path: a byte challenge pinned to the responder's retained +/// commitment, for keys it committed to and still stores, returns `Items` with +/// the ORIGINAL bytes (`Present`) for every requested key. +/// +/// FLIPS IF: the responder stops serving original bytes for committed keys — +/// the auditor would then see byte-verification failures for honest nodes. +#[tokio::test] +async fn byte_challenge_serves_original_bytes_for_committed_keys() { + let (storage, _t) = test_storage().await; + let r = Responder::new(&storage, &[1, 2, 3, 4]).await; + let pin = r.current_hash(); + + let keys = vec![Responder::address(1), Responder::address(2)]; + let challenge = SubtreeByteChallenge { + challenge_id: 43, + nonce: [0x22u8; 32], + challenged_peer_id: r.peer_id_bytes, + expected_commitment_hash: pin, + keys: keys.clone(), + }; + + let resp = + handle_subtree_byte_challenge(&challenge, &storage, &r.peer_id, false, Some(&r.state)) + .await; + + match resp { + SubtreeByteResponse::Items { + challenge_id, + items, + } => { + assert_eq!(challenge_id, 43); + assert_eq!(items.len(), keys.len(), "one item per requested key"); + for (item, (i, key)) in items.iter().zip([1u8, 2].into_iter().zip(keys)) { + match item { + SubtreeByteItem::Present { key: k, bytes } => { + assert_eq!(*k, key); + assert_eq!(*bytes, chunk_content(i), "must serve the ORIGINAL bytes"); + } + other @ SubtreeByteItem::Absent { .. 
} => { + panic!("expected Present for stored committed key, got {other:?}") + } + } + } + } + other => panic!("expected Items, got {other:?}"), + } +} + +/// A byte challenge requesting more than `MAX_BYTE_CHALLENGE_KEYS` keys is +/// rejected up front: an honest auditor batches its sample to that cap so the +/// worst-case response (all chunks at max size) fits the replication wire cap; +/// anything larger is a forged/over-size request the responder must not try to +/// serve (the response could not encode, and reading the chunks first would be +/// disk-read amplification). +/// +/// FLIPS IF: the per-challenge key cap is removed from the responder. +#[tokio::test] +async fn oversize_byte_challenge_is_rejected() { + let (storage, _t) = test_storage().await; + let r = Responder::new(&storage, &[1, 2, 3, 4]).await; + let pin = r.current_hash(); + + let keys: Vec<[u8; 32]> = (0..=MAX_BYTE_CHALLENGE_KEYS) + .map(|i| [u8::try_from(i % 251).unwrap_or(0); 32]) + .collect(); + assert!(keys.len() > MAX_BYTE_CHALLENGE_KEYS); + let challenge = SubtreeByteChallenge { + challenge_id: 44, + nonce: [0x33u8; 32], + challenged_peer_id: r.peer_id_bytes, + expected_commitment_hash: pin, + keys, + }; + + let resp = + handle_subtree_byte_challenge(&challenge, &storage, &r.peer_id, false, Some(&r.state)) + .await; + + match resp { + SubtreeByteResponse::Rejected { reason, .. } => { + assert!( + reason.contains("max"), + "expected per-challenge key-cap rejection, got: {reason}" + ); + } + other => panic!("expected Rejected(oversize), got {other:?}"), + } +} diff --git a/tests/poc_bootstrap_stall.rs b/tests/poc_bootstrap_stall.rs new file mode 100644 index 00000000..4c0cc8cb --- /dev/null +++ b/tests/poc_bootstrap_stall.rs @@ -0,0 +1,266 @@ +//! Proof-of-concept regression test for the **bootstrap stall** attack +//! against the neighbour-sync admission / drain detector. +//! +//! ## The attack (no fix yet) +//! +//! 
While a node is bootstrapping, every inbound `NeighborSyncRequest` +//! whose admission overflows `MAX_PENDING_VERIFY_PER_PEER` (the per-peer +//! cap is the first to bite for any single peer) calls +//! `bootstrap::note_capacity_rejected(source)`. The drain check in +//! `bootstrap::check_bootstrap_drained` then refuses to complete +//! bootstrap while the set is non-empty: +//! +//! ```ignore +//! if !state.capacity_rejected_sources.is_empty() { +//! return false; // "not yet drained" +//! } +//! ``` +//! +//! The set entry for `source` is cleared only when **the same source** +//! later completes an admission cycle with zero rejections. A single +//! peer that keeps sending over-cap hints faster than the verification +//! queue drains never has a "clean cycle" — so it is **permanently** +//! in `capacity_rejected_sources`, and bootstrap **never completes**. +//! +//! ## Why this matters +//! +//! While `is_bootstrapping == true`: +//! - **Audits are paused** (`replication::audit::audit_tick` returns +//! `Idle` if `is_bootstrapping`, see `audit.rs` Invariant 19). A +//! victim stuck in bootstrap mode is effectively a node that does no +//! auditing — bad nodes around it accrue no trust penalties. +//! - Other replication invariants gated on `bootstrap_drained` (paid +//! list repair flow, prune confirmation paths) also stay off. +//! +//! A single Byzantine peer in the victim's routing table can therefore +//! disable the entire reputation system on that victim, for free, +//! using nothing but well-formed `NeighborSyncRequest` messages that +//! the victim's admission path accepts as legitimate. +//! +//! ## What this test proves +//! +//! Drives the in-process pieces (`ReplicationQueues`, `BootstrapState`, +//! `bootstrap::note_capacity_rejected` / +//! `bootstrap::check_bootstrap_drained`) end-to-end through the same +//! call sequence that the live replication loop runs when handling an +//! over-cap `NeighborSyncRequest`. With no fix this test passes — i.e. 
+//! it documents the buggy behaviour by asserting the victim never +//! drains. The fix (whatever shape it takes — per-source rate limits, +//! capacity-reject decay, trust-event escalation, ...) will need a +//! follow-up test asserting drain happens within a bounded number of +//! over-cap cycles. + +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::missing_panics_doc, + clippy::significant_drop_tightening +)] + +use std::collections::HashSet; +use std::sync::Arc; +use std::time::Instant; + +use tokio::sync::RwLock; + +use ant_node::replication::bootstrap::{ + check_bootstrap_drained, clear_capacity_rejected, note_capacity_rejected, +}; +use ant_node::replication::scheduling::{ + AdmissionResult, ReplicationQueues, MAX_PENDING_VERIFY_PER_PEER, +}; +use ant_node::replication::types::{ + BootstrapState, HintPipeline, VerificationEntry, VerificationState, +}; +use saorsa_core::identity::PeerId; + +fn peer(b: u8) -> PeerId { + let mut bytes = [0u8; 32]; + bytes[0] = b; + PeerId::from_bytes(bytes) +} + +fn entry(sender: PeerId) -> VerificationEntry { + VerificationEntry { + state: VerificationState::PendingVerify, + pipeline: HintPipeline::Replica, + verified_sources: Vec::new(), + tried_sources: HashSet::new(), + created_at: Instant::now(), + hint_sender: sender, + } +} + +fn unique_key(i: u32) -> [u8; 32] { + let mut k = [0u8; 32]; + k[..4].copy_from_slice(&i.to_le_bytes()); + k +} + +/// Simulates one inbound `NeighborSyncRequest` from `source` carrying +/// `hint_count` hints — returns the number of admissions that capacity- +/// rejected (i.e. what `AdmissionOutcome::capacity_rejected_count` would +/// be in the live loop), and as a side effect mutates `queues` and the +/// bootstrap-state in exactly the same way the live `admit_and_queue_hints` +/// followed by the bootstrap-drain accounting do. 
+async fn simulate_inbound_sync( + queues: &Arc>, + bootstrap_state: &Arc>, + source: PeerId, + key_offset: u32, + hint_count: u32, +) -> usize { + let mut capacity_rejected_count: usize = 0; + + { + let mut q = queues.write().await; + for i in 0..hint_count { + let result = q.add_pending_verify(unique_key(key_offset + i), entry(source)); + match result { + AdmissionResult::Admitted | AdmissionResult::AlreadyPresent => {} + AdmissionResult::CapacityRejected => { + capacity_rejected_count += 1; + } + } + } + } + + // Mirror replication/mod.rs:1391-1400: while bootstrapping, note or + // clear capacity rejection for this source based on the outcome. + if capacity_rejected_count > 0 { + note_capacity_rejected(bootstrap_state, source).await; + } else { + clear_capacity_rejected(bootstrap_state, &source).await; + } + + capacity_rejected_count +} + +/// **The attack.** A single peer keeps the victim's bootstrap permanently +/// undrained by always sending one more hint than the per-peer pending +/// quota can accept. The victim's `capacity_rejected_sources` set stays +/// non-empty forever, so `check_bootstrap_drained` never returns `true`. +/// +/// Pre-fix behaviour: this test passes (the attack succeeds — drain never +/// completes). The presence of this test is the regression marker. +/// +/// Post-fix behaviour: the fix MUST cause `check_bootstrap_drained` to +/// return `true` within a bounded number of cycles regardless of attacker +/// flood pattern. A follow-up test should assert that bound. +#[tokio::test] +async fn poc_bootstrap_stall_via_persistent_per_peer_overflow() { + let queues = Arc::new(RwLock::new(ReplicationQueues::new())); + let bootstrap_state = Arc::new(RwLock::new(BootstrapState::new())); + + let attacker = peer(0xAA); + + // Round 1: attacker sends per-peer-cap + 1 hints. The first + // MAX_PENDING_VERIFY_PER_PEER admit; the last over-cap one rejects. + // After this round, `capacity_rejected_sources` contains the attacker. 
+ let mut next_key: u32 = 0; + #[allow(clippy::cast_possible_truncation)] + let flood = MAX_PENDING_VERIFY_PER_PEER as u32 + 1; + let rejected = + simulate_inbound_sync(&queues, &bootstrap_state, attacker, next_key, flood).await; + next_key += flood; + assert!( + rejected >= 1, + "round 1 must over-cap (got {rejected} rejections); test is mis-sized" + ); + + // Victim has nothing else outstanding: no other pending peer requests, + // no other pending keys discovered. The ONLY thing preventing drain + // is `capacity_rejected_sources` containing the attacker. + let drained_before_attack_continues = { + let q = queues.read().await; + check_bootstrap_drained(&bootstrap_state, &q).await + }; + assert!( + !drained_before_attack_continues, + "bootstrap must NOT drain while attacker has outstanding capacity-rejected hints" + ); + + // Round 2..N: attacker keeps sending one more over-cap hint each + // round. In the live loop, the victim's verification cycle would + // drain a few entries between rounds, but the attacker just sends + // more hints than fit. Here we simulate that pattern by NEVER + // draining queues between attacker rounds: this is the worst-case + // for the victim and matches an attacker who paces hints to keep + // pending_per_sender[attacker] always at the cap. + for round in 0..32 { + let r = simulate_inbound_sync(&queues, &bootstrap_state, attacker, next_key, 1).await; + next_key += 1; + // Each round must keep capacity-rejecting (per-peer cap still hit + // because we never freed slots for this sender). 
+ assert!( + r >= 1, + "round {round}: attacker hint must continue to capacity-reject \ + (per-peer cap still full); got {r}" + ); + + let drained = { + let q = queues.read().await; + check_bootstrap_drained(&bootstrap_state, &q).await + }; + assert!( + !drained, + "round {round}: bootstrap drained despite attacker still capacity-rejecting" + ); + } + + // After 32 rounds (could be 32 million) the attacker is STILL in + // `capacity_rejected_sources`. The victim is permanently in + // bootstrap mode. This is the bug. + let state = bootstrap_state.read().await; + assert!( + state.capacity_rejected_sources.contains(&attacker), + "attacker peer is still in capacity_rejected_sources after the flood — \ + this is the documented stall: the victim has no mechanism to retire \ + the attacker without the attacker's cooperation (a 'clean' admission \ + cycle), so a hostile peer can stall bootstrap indefinitely" + ); + assert_eq!( + state.capacity_rejected_sources.len(), + 1, + "only the attacker is outstanding; honest peers are unaffected — \ + which is exactly what makes this a single-peer DoS" + ); +} + +/// Honest peers are unaffected: the per-source quota means a flood from +/// the attacker cannot starve an honest peer's hints. The honest peer's +/// "clean" cycle correctly clears its bootstrap entry. This test +/// confirms the per-source isolation that the bounded-queues defence +/// (`poc_d1_bounded_queues`) already established — included so a future +/// fix doesn't accidentally break it. +#[tokio::test] +async fn honest_peer_drains_normally_alongside_attacker() { + let queues = Arc::new(RwLock::new(ReplicationQueues::new())); + let bootstrap_state = Arc::new(RwLock::new(BootstrapState::new())); + + let attacker = peer(0xAA); + let honest = peer(0x01); + + // Attacker over-caps. 
+ #[allow(clippy::cast_possible_truncation)] + let flood = MAX_PENDING_VERIFY_PER_PEER as u32 + 1; + let r_atk = simulate_inbound_sync(&queues, &bootstrap_state, attacker, 0, flood).await; + assert!(r_atk >= 1); + + // Honest peer sends a small clean batch. + let r_honest = simulate_inbound_sync(&queues, &bootstrap_state, honest, flood + 100, 16).await; + assert_eq!( + r_honest, 0, + "honest peer's small batch must NOT capacity-reject — per-source quota isolates them" + ); + + let state = bootstrap_state.read().await; + assert!( + state.capacity_rejected_sources.contains(&attacker), + "attacker is outstanding" + ); + assert!( + !state.capacity_rejected_sources.contains(&honest), + "honest peer is NOT outstanding; its clean cycle cleared (or never created) its entry" + ); +} diff --git a/tests/poc_commitment_audit_attacks.rs b/tests/poc_commitment_audit_attacks.rs new file mode 100644 index 00000000..f3bf2ffb --- /dev/null +++ b/tests/poc_commitment_audit_attacks.rs @@ -0,0 +1,909 @@ +//! Threat-model proof-of-concept tests for the gossip-triggered +//! contiguous-subtree storage audit (ADR-0002, +//! `docs/adr/ADR-0002-gossip-triggered-contiguous-subtree-audit.md`). +//! +//! Each test models a specific storage-binding attack from the security +//! review that motivated ADR-0002 and asserts that the subtree-audit +//! mechanisms reject it. This file is the single canonical place to look for +//! "does the subtree audit actually close the storage-binding holes?" — each +//! `#[test]` docstring describes the attack it closes. +//! +//! ## How the auditor is modelled here +//! +//! The production auditor's `verify_subtree_response` (in +//! `src/replication/storage_commitment_audit.rs`) is private, so this file +//! reproduces the exact ordered gates it runs — pin, peer-id binding, +//! signature, structural [`verify_subtree_proof`], then the **round-2 byte +//! challenge**: the auditor demands the ORIGINAL chunk bytes for a +//! 
nonce-selected sample of the just-proven leaves FROM THE RESPONDER and +//! verifies the served content against each leaf's committed `bytes_hash` +//! (content address) and `nonced_hash` (freshness). Possession is +//! non-delegable: the auditor needs to hold NONE of the responder's chunks, +//! and a committed key the responder cannot serve is a deterministic, +//! confirmed failure (`DigestMismatch` in production — never inconclusive, +//! never graced). The helper [`auditor_accepts`] runs these gates in the same +//! order with the same failure semantics, so a reviewer can see each attack +//! is caught at the same gate the network code would catch it. +//! +//! ## What changed from the old per-key audit (and why) +//! +//! The OLD audit named individual keys and sampled a per-key Merkle inclusion +//! proof + digest. The subtree audit names NO keys: the nonce alone selects one +//! contiguous subtree, the responder must expand it in full, and a few leaves +//! are byte-checked. Consequently these per-key-only attacks were DROPPED — they +//! have no analogue under subtree sampling: +//! +//! * "key not in commitment" / overclaim-via-partial-commitment — the auditor +//! never names a key, so a responder can't be asked to prove an uncommitted +//! key; it proves whatever the nonce selects from its own committed tree. +//! * per-key digest order / per-key path tamper — replaced by the subtree +//! structural checks (leaf count, ascending order, cut-hash count, root +//! rebuild) and the per-leaf real-bytes spot-check. +//! * `RecentProvers` holder-credit revocation/rotation tests — those exercised +//! the cache binding, not the audit proof, and now live with the cache; the +//! subtree auditor credits per proven leaf (`AuditCredit`) but the credit +//! binding itself is unchanged and tested elsewhere. +//! +//! Attacks PRESERVED in spirit, ported to the subtree model: fresh-commitment +//! substitution, cross-peer commitment substitution, throwaway-key +//! 
substitution, wrong-signer, replay-under-fresh-nonce, repudiation of a +//! recently gossiped pin, and the lazy/relay "holds addresses not bytes" +//! fabricated-possession attack. Plus subtree-native structural attacks: +//! tampered cut-hash, wrong leaf count, reordered leaves. + +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::missing_panics_doc, + clippy::redundant_clone, + clippy::cast_possible_truncation, + clippy::doc_markdown, + clippy::needless_borrows_for_generic_args +)] + +use ant_node::replication::commitment::{ + commitment_hash, leaf_hash, sign_commitment, verify_commitment_signature, MerkleTree, + StorageCommitment, +}; +use ant_node::replication::commitment_state::{BuiltCommitment, ResponderCommitmentState}; +use ant_node::replication::config::AUDIT_SPOTCHECK_COUNT; +use ant_node::replication::subtree::{ + build_subtree_proof, nonced_leaf_hash, select_spotcheck_indices, select_subtree_path, + verify_subtree_proof, StructureVerdict, SubtreeProof, +}; +use saorsa_pqc::api::sig::{ml_dsa_65, MlDsaPublicKey, MlDsaSecretKey}; + +// --------------------------------------------------------------------------- +// Fixtures +// --------------------------------------------------------------------------- + +fn keypair() -> (MlDsaPublicKey, MlDsaSecretKey) { + ml_dsa_65().generate_keypair().unwrap() +} + +/// Deterministic chunk bytes for key index `i`. The committed tree is built +/// from `BLAKE3(content(i))`, so an honest proof — which hashes the same bytes — +/// reconstructs the committed root and passes the real-bytes spot-check. +fn content(i: u32) -> Vec { + let mut v = key(i).to_vec(); + v.extend_from_slice(b"subtree-audit-chunk-body"); + v.extend_from_slice(&i.to_le_bytes()); + v +} + +fn content_hash(i: u32) -> [u8; 32] { + *blake3::hash(&content(i)).as_bytes() +} + +/// Big-endian key so numeric order matches the MerkleTree sort order; this lets +/// us reason about leaf positions when we tamper with them. 
+fn key(i: u32) -> [u8; 32] { + let mut k = [0u8; 32]; + k[..4].copy_from_slice(&i.to_be_bytes()); + k +} + +/// A responder identity (real ML-DSA keypair) plus its retention state. Peer +/// identity is derived from the public key exactly as in production +/// (saorsa-core `peer_id_from_public_key` = `BLAKE3(pubkey_bytes)`). +struct Responder { + state: ResponderCommitmentState, + public_key: MlDsaPublicKey, + secret_key: MlDsaSecretKey, + peer_id_bytes: [u8; 32], +} + +impl Responder { + fn new() -> Self { + let (public_key, secret_key) = keypair(); + let peer_id_bytes = *blake3::hash(&public_key.to_bytes()).as_bytes(); + Self { + state: ResponderCommitmentState::new(), + public_key, + secret_key, + peer_id_bytes, + } + } + + /// Commit to keys `[0, n)` and rotate that commitment into `current`. + /// Returns the new commitment hash. + fn commit_to_range(&self, n: u32) -> [u8; 32] { + let entries: Vec<_> = (0..n).map(|i| (key(i), content_hash(i))).collect(); + let built = BuiltCommitment::build( + entries, + &self.peer_id_bytes, + &self.secret_key, + &self.public_key.to_bytes(), + ) + .unwrap(); + let h = built.hash(); + self.state.rotate(built); + h + } +} + +/// Bytes source for an HONEST responder: it really holds every chunk it +/// committed to, so it can always produce a correct `nonced_hash`. +fn honest_bytes(k: &[u8; 32]) -> Option> { + for i in 0..4096u32 { + if &key(i) == k { + return Some(content(i)); + } + } + None +} + +/// The auditor's full ordered verification, mirroring the production +/// `verify_subtree_response` gates. Returns `Ok(byte_checked_count)` on accept. +/// +/// `responder_serves(k)` models round 2 (`SubtreeByteChallenge`): what the +/// RESPONDER returns when the auditor demands the original bytes of sampled +/// leaf `k`. 
`Some(bytes)` is a `SubtreeByteItem::Present`; `None` is an +/// explicit `Absent` or an omitted key — a committed key the responder will +/// not serve, which production `verify_byte_response` counts as a confirmed +/// `DigestMismatch`. The auditor verifies the SERVED content, so it needs to +/// hold none of the responder's chunks and no inconclusive lane exists. +fn auditor_accepts( + challenged_peer_id: &[u8; 32], + expected_commitment_hash: &[u8; 32], + nonce: &[u8; 32], + commitment: &StorageCommitment, + proof: &SubtreeProof, + responder_serves: impl Fn(&[u8; 32]) -> Option>, +) -> Result { + // -- Gate: pin + peer-id binding + signature ---------------------------- + if commitment.sender_peer_id != *challenged_peer_id { + return Err(AuditError::SenderPeerIdMismatch); + } + let derived = *blake3::hash(&commitment.sender_public_key).as_bytes(); + if derived != commitment.sender_peer_id { + return Err(AuditError::PeerIdKeyMismatch); + } + match commitment_hash(commitment) { + Some(h) if &h == expected_commitment_hash => {} + _ => return Err(AuditError::CommitmentHashMismatch), + } + if !verify_commitment_signature(commitment) { + return Err(AuditError::SignatureInvalid); + } + + // -- Gate: structure ---------------------------------------------------- + if let StructureVerdict::Invalid(why) = verify_subtree_proof(proof, nonce, commitment) { + return Err(AuditError::StructureInvalid(why)); + } + + // -- Gate: round-2 byte challenge (responder-served possession) ---------- + // Mirrors `verify_subtree_response` round 2: the sample is nonce-derived + // from the same primitives as production (`AUDIT_SPOTCHECK_COUNT` clamped + // to the auditor's 3..=5 band), and EVERY sampled leaf must verify from + // the bytes the responder serves. There is no skip and no inconclusive + // lane: a committed key the responder cannot serve is a provable lie. 
+ let path = select_subtree_path(nonce, commitment.key_count) + .ok_or(AuditError::StructureInvalid("out-of-protocol key_count"))?; + let spot = select_spotcheck_indices(nonce, &path, AUDIT_SPOTCHECK_COUNT.clamp(3, 5)); + if spot.is_empty() { + // Cannot happen after a valid structure (the subtree is never empty), + // but mirror the production guard: never credit an unproven peer. + return Err(AuditError::StructureInvalid("empty spot-check sample")); + } + let mut checked = 0usize; + for idx in spot { + let leaf = proof + .leaves + .get(idx as usize) + .ok_or(AuditError::StructureInvalid("spot index out of range"))?; + let Some(bytes) = responder_serves(&leaf.key) else { + // Absent/omitted committed key → confirmed failure (production + // maps this to `DigestMismatch`), NOT a skip. + return Err(AuditError::CommittedKeyUnserved); + }; + let plain = *blake3::hash(&bytes).as_bytes(); + let nonced = nonced_leaf_hash(nonce, &commitment.sender_peer_id, &leaf.key, &bytes); + if leaf.bytes_hash != plain || leaf.nonced_hash != nonced { + return Err(AuditError::RealBytesMismatch); + } + checked += 1; + } + Ok(checked) +} + +#[derive(Debug, PartialEq, Eq)] +enum AuditError { + SenderPeerIdMismatch, + PeerIdKeyMismatch, + CommitmentHashMismatch, + SignatureInvalid, + StructureInvalid(&'static str), + /// Round 2: the responder served content that does not hash to the + /// committed address / freshness hash (production: `DigestMismatch`). + RealBytesMismatch, + /// Round 2: the responder would not serve a committed, sampled key + /// (production: `DigestMismatch` — a deterministic, confirmed failure). + CommittedKeyUnserved, +} + +/// Build an honest subtree proof for `nonce` against the responder's current +/// committed tree, returning `(proof, commitment)` as the auditor would receive +/// them in a `SubtreeAuditResponse::Proof`. 
+fn honest_proof_and_commitment( + r: &Responder, + nonce: &[u8; 32], +) -> (SubtreeProof, StorageCommitment) { + let built = r.state.current().unwrap(); + let proof = build_subtree_proof(built.tree(), nonce, &r.peer_id_bytes, honest_bytes).unwrap(); + (proof, built.commitment().clone()) +} + +// --------------------------------------------------------------------------- +// Sanity: the honest path the attack tests are measured against actually passes +// --------------------------------------------------------------------------- + +/// Anchor: an honest responder that committed to its keys and still holds the +/// bytes produces a proof the (modelled) auditor accepts. Without this, the +/// rejection assertions below could pass vacuously. +#[test] +fn honest_responder_passes_audit() { + let nonce = [0xCD; 32]; + let honest = Responder::new(); + let pin = honest.commit_to_range(64); + let (proof, commitment) = honest_proof_and_commitment(&honest, &nonce); + + let res = auditor_accepts( + &honest.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + honest_bytes, + ); + assert!(res.is_ok(), "honest path must pass, got {res:?}"); + assert!(res.unwrap() >= 1, "must byte-check at least one leaf"); +} + +// --------------------------------------------------------------------------- +// Storage-binding path A: lazy/relay node holds chunk ADDRESSES, not bytes +// --------------------------------------------------------------------------- + +/// Attack 1a (path A) — the storage-binding heart of the subtree +/// audit. A lazy/relay node retained the gossiped commitment and knows every +/// leaf's `bytes_hash` (that value IS the chunk's network address, which is +/// public), but it DROPPED the actual bytes. It fabricates a proof: correct +/// `key` and correct `bytes_hash` for every selected leaf (so the structural +/// root rebuild passes), but it cannot compute the `nonced_hash`, which requires +/// the real bytes under a fresh nonce. 
It fills in a forged `nonced_hash`. +/// +/// The structural gate PASSES (addresses alone rebuild the root), proving that +/// structure is NOT sufficient — exactly the storage-binding hole. Round 2 is what +/// catches it: the auditor demands the original bytes FROM THE RELAY, and the +/// relay has nothing to serve. Refusing/omitting a sampled committed key is a +/// confirmed failure, and serving fabricated bytes cannot hash to the +/// committed content address (a preimage break) — both lanes are asserted. +#[test] +fn relay_holding_only_addresses_caught_by_real_bytes_check() { + let nonce = [0x77; 32]; + let honest_keyset = Responder::new(); + let pin = honest_keyset.commit_to_range(100); + let built = honest_keyset.state.current().unwrap(); + + // The lazy node fabricates the proof from PUBLIC data only: it knows each + // leaf key and its bytes_hash (== address), but NOT the bytes, so it forges + // every nonced_hash. + let path = select_subtree_path(&nonce, built.commitment().key_count).unwrap(); + let mut leaves = Vec::new(); + for idx in path.leaf_start..path.leaf_end { + let k = built.tree().key_at(idx as usize).unwrap(); + // bytes_hash is public (== the chunk address); the responder fakes the + // possession hash because it lacks the bytes. + let forged_nonced = *blake3::hash(b"i-do-not-have-the-bytes").as_bytes(); + leaves.push(ant_node::replication::subtree::SubtreeLeaf { + key: k, + bytes_hash: content_hash(idx), + nonced_hash: forged_nonced, + }); + } + // Real sibling cut-hashes from the committed tree (public, derivable). + let plan = ant_node::replication::subtree::subtree_plan(built.tree(), &nonce).unwrap(); + let forged = SubtreeProof { + leaves, + sibling_cut_hashes: plan.sibling_cut_hashes, + }; + + // Structure alone PASSES — addresses are enough to rebuild the root. This + // is the precise reason structure is insufficient on its own. 
+ assert_eq!( + verify_subtree_proof(&forged, &nonce, built.commitment()), + StructureVerdict::Valid, + "address-only proof rebuilds the root (structure cannot bind possession)" + ); + + // Lane 1: the relay cannot serve the sampled bytes (it holds none). An + // unserved committed key is a deterministic, confirmed failure. + let res = auditor_accepts( + &honest_keyset.peer_id_bytes, + &pin, + &nonce, + built.commitment(), + &forged, + |_k| None, // the relay has no bytes to serve + ); + assert_eq!( + res, + Err(AuditError::CommittedKeyUnserved), + "a relay that cannot serve sampled bytes must fail round 2, got {res:?}" + ); + + // Lane 2: the relay serves fabricated bytes instead. They cannot hash to + // the committed content address, so the served-content check catches it. + let res = auditor_accepts( + &honest_keyset.peer_id_bytes, + &pin, + &nonce, + built.commitment(), + &forged, + |_k| Some(b"fabricated-not-the-chunk".to_vec()), + ); + assert_eq!( + res, + Err(AuditError::RealBytesMismatch), + "fabricated served bytes must fail the content-address check, got {res:?}" + ); +} + +/// Attack 1a, detection-probability framing: a responder that fabricates a +/// FRACTION of leaves (holds some bytes, forged the rest) survives one audit +/// only with probability `(1 - x)^k` over `k` byte-challenged leaves. This pins +/// that any sampled leaf it forged is fatal — the responder cannot predict +/// which leaves are sampled, because the spot-check indices are derived from +/// the same nonce that fixes the whole proof, and round 2 re-derives the +/// freshness hash from the bytes it is forced to serve. 
+#[test] +fn fabricated_fraction_is_caught_when_a_forged_leaf_is_sampled() { + let nonce = [0x31; 32]; + let r = Responder::new(); + let pin = r.commit_to_range(400); + let (mut proof, commitment) = honest_proof_and_commitment(&r, &nonce); + + // Forge the nonced hash on every byte-challenged position (worst case for + // the attacker: all sampled leaves are fabricated → guaranteed catch). The + // sample expression matches the auditor's exactly. + let path = select_subtree_path(&nonce, commitment.key_count).unwrap(); + for idx in select_spotcheck_indices(&nonce, &path, AUDIT_SPOTCHECK_COUNT.clamp(3, 5)) { + if let Some(leaf) = proof.leaves.get_mut(idx as usize) { + leaf.nonced_hash[0] ^= 0xFF; + } + } + + // Even if the responder serves the REAL bytes in round 2, the freshness + // hash recomputed from that served content exposes the forged leaf. + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + honest_bytes, + ); + assert_eq!( + res, + Err(AuditError::RealBytesMismatch), + "a forged leaf landing under the byte challenge must fail, got {res:?}" + ); +} + +/// Attack 1a, non-delegable possession (the lane that replaced "inconclusive"): +/// a relay returns a structurally-valid, address-only proof and the AUDITOR +/// holds none of the chunks — the pre-ADR-0002 design had to call this +/// inconclusive because it byte-checked against the auditor's own copies. +/// Under the shipped two-round audit there is no such lane: round 2 demands +/// the bytes from the RESPONDER, so auditor overlap is irrelevant and a relay +/// that cannot serve its committed bytes fails DETERMINISTICALLY (a confirmed +/// failure in production, not idle/inconclusive, and never a free pass). 
+#[test] +fn relay_unable_to_serve_bytes_fails_deterministically_regardless_of_auditor_overlap() { + let nonce = [0x19; 32]; + let r = Responder::new(); + let pin = r.commit_to_range(100); + // Honest structure (real bytes existed at commit time), so round 1 passes; + // the point is the responder dropped the bytes and cannot serve them. + let (proof, commitment) = honest_proof_and_commitment(&r, &nonce); + + let relay_serves_nothing = |_k: &[u8; 32]| -> Option> { None }; + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + relay_serves_nothing, + ); + assert_eq!( + res, + Err(AuditError::CommittedKeyUnserved), + "an unserved sampled key ⇒ deterministic confirmed failure, got {res:?}" + ); +} + +// --------------------------------------------------------------------------- +// Storage-binding path B: fresh-commitment substitution +// --------------------------------------------------------------------------- + +/// Attack 1b (path B): a responder builds a FRESH commitment over a +/// different key set and answers with a valid proof against THAT commitment, +/// while the auditor pinned the hash of the commitment the peer actually +/// gossiped. The auditor's pin (`commitment_hash == expected_commitment_hash`) +/// rejects the substitution before any structural work. +#[test] +fn fresh_commitment_substitution_rejected_by_pin() { + let nonce = [0xCD; 32]; + + let original = Responder::new(); + let pinned_hash = original.commit_to_range(64); + + // Same peer rotates to a fresh commitment over a different range; it can + // build a perfectly valid proof against the NEW commitment. + let fresh_hash = original.commit_to_range(32); + assert_ne!(pinned_hash, fresh_hash); + let (proof, fresh_commitment) = honest_proof_and_commitment(&original, &nonce); + + // Auditor still pins the ORIGINAL hash. 
+ let res = auditor_accepts( + &original.peer_id_bytes, + &pinned_hash, // <- original pin, not fresh_hash + &nonce, + &fresh_commitment, + &proof, + honest_bytes, + ); + assert_eq!( + res, + Err(AuditError::CommitmentHashMismatch), + "fresh-commitment substitution must trip the pin, got {res:?}" + ); +} + +// --------------------------------------------------------------------------- +// Storage-binding path C: cross-peer commitment substitution +// --------------------------------------------------------------------------- + +/// Attack 1c (peer impersonation): peer Q lifts peer P's signed +/// commitment from gossip and embeds it in its own response, hoping the auditor +/// verifies P's signature by mistake. The auditor binds the commitment's +/// `sender_peer_id` to the challenged peer; the stolen commitment names P, not +/// Q, so it is rejected before any signature/structure work. +#[test] +fn cross_peer_commitment_substitution_rejected_by_sender_id() { + let nonce = [0xCD; 32]; + + let real_p = Responder::new(); + let p_hash = real_p.commit_to_range(64); + let (p_proof, p_commitment) = honest_proof_and_commitment(&real_p, &nonce); + + // Auditor is challenging Q (a different peer id) but somehow holds p_hash in + // its pin (modelling a mis-binding); Q replays P's commitment + proof. + let q_peer_id = [0xCC; 32]; + let res = auditor_accepts( + &q_peer_id, // challenged peer is Q + &p_hash, + &nonce, + &p_commitment, // sender_peer_id == P, not Q + &p_proof, + honest_bytes, + ); + assert_eq!( + res, + Err(AuditError::SenderPeerIdMismatch), + "cross-peer substitution must trip the sender-id binding, got {res:?}" + ); +} + +/// Attack 1c': throwaway-key substitution. An adversary wants to answer as peer +/// P (whose pubkey it does NOT control). It builds a commitment naming P's +/// peer_id but embedding a throwaway pubkey it can sign with — the signature +/// verifies under the embedded key. 
The peer-id↔key binding +/// (`peer_id == BLAKE3(embedded_pubkey)`) rejects it: the embedded throwaway key +/// does not hash to P's peer_id. +#[test] +#[allow(clippy::similar_names)] +fn throwaway_key_substitution_rejected_by_pubkey_binding() { + let nonce = [0xCD; 32]; + + // P's real identity (adversary does not hold P's secret key). + let (p_pubkey, _p_secret) = keypair(); + let p_peer_id = *blake3::hash(&p_pubkey.to_bytes()).as_bytes(); + + // Adversary's throwaway keypair. + let (throwaway_pk, throwaway_sk) = keypair(); + let throwaway_pk_bytes = throwaway_pk.to_bytes(); + + // Build a commitment naming P's peer_id but embedding+signing with the + // throwaway key. + let entries: Vec<_> = (0..8u32).map(|i| (key(i), content_hash(i))).collect(); + let tree = MerkleTree::build(entries).unwrap(); + let root = tree.root(); + let key_count = tree.key_count(); + let sig = sign_commitment( + &throwaway_sk, + &root, + key_count, + &p_peer_id, // claims P (the lie) + &throwaway_pk_bytes, + ) + .unwrap(); + let bad_commit = StorageCommitment { + root, + key_count, + sender_peer_id: p_peer_id, + sender_public_key: throwaway_pk_bytes, + signature: sig, + }; + let pin = commitment_hash(&bad_commit).unwrap(); + + // A perfectly valid proof against the bad commitment's own tree. + let proof = build_subtree_proof(&tree, &nonce, &p_peer_id, honest_bytes).unwrap(); + + let res = auditor_accepts(&p_peer_id, &pin, &nonce, &bad_commit, &proof, honest_bytes); + assert_eq!( + res, + Err(AuditError::PeerIdKeyMismatch), + "throwaway-key attack must trip the peer-id↔key binding, got {res:?}" + ); +} + +/// Attack 1c'' — wrong signer at the signature gate. To isolate the signature +/// gate from the bindings above, the adversary swaps BOTH the embedded pubkey +/// and the sender_peer_id to a consistent (wrong) identity, and re-pins the +/// auditor to the mutated commitment. 
Now the peer-id binding and pin pass, but +/// the signature was produced under the ORIGINAL secret key over the ORIGINAL +/// payload — it cannot verify under the swapped key. +#[test] +fn wrong_signer_rejected_at_signature_gate() { + let nonce = [0xCD; 32]; + + let responder = Responder::new(); + responder.commit_to_range(16); + let (proof, commitment) = honest_proof_and_commitment(&responder, &nonce); + + let (wrong_pk, _wrong_sk) = keypair(); + let wrong_pk_bytes = wrong_pk.to_bytes(); + let wrong_peer_id = *blake3::hash(&wrong_pk_bytes).as_bytes(); + + let mut bad_commit = commitment.clone(); + bad_commit.sender_public_key = wrong_pk_bytes; + bad_commit.sender_peer_id = wrong_peer_id; + let new_pin = commitment_hash(&bad_commit).unwrap(); + + // The proof's leaves bind the ORIGINAL peer_id in their nonced hashes, but + // the signature gate fires BEFORE the structural/real-bytes gates, so it is + // the first (and asserted) failure. + let res = auditor_accepts( + &wrong_peer_id, + &new_pin, + &nonce, + &bad_commit, + &proof, + honest_bytes, + ); + assert_eq!( + res, + Err(AuditError::SignatureInvalid), + "swapped embedded key must trip the signature gate, got {res:?}" + ); +} + +// --------------------------------------------------------------------------- +// Storage-binding path D: replay an old response under a fresh nonce +// --------------------------------------------------------------------------- + +/// Attack 1d (replay): the auditor issues a fresh nonce each audit. +/// The nonce both selects the subtree AND freshens every leaf's possession hash, +/// so a response captured under an old nonce cannot be replayed: the new nonce +/// selects a different subtree (wrong leaf set / cut-hash count) and the stale +/// nonced hashes no longer match. Asserts the structural gate alone already +/// rejects the stale proof under the new nonce. 
+#[test] +fn audit_response_replay_blocked_by_fresh_nonce() { + let old_nonce = [0xCD; 32]; + let fresh_nonce = [0xEF; 32]; + + let r = Responder::new(); + let pin = r.commit_to_range(256); + let (stale_proof, commitment) = honest_proof_and_commitment(&r, &old_nonce); + + // Sanity: the stale proof was valid under its own (old) nonce. + assert_eq!( + verify_subtree_proof(&stale_proof, &old_nonce, &commitment), + StructureVerdict::Valid + ); + + // Replayed verbatim under the fresh nonce, it fails — the new nonce selects + // a different subtree, so even the structure no longer reconstructs. + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &fresh_nonce, // <- different nonce + &commitment, + &stale_proof, + honest_bytes, + ); + assert!( + matches!(res, Err(AuditError::StructureInvalid(_))), + "replay under a fresh nonce must fail the structural gate, got {res:?}" + ); +} + +// --------------------------------------------------------------------------- +// Subtree-native structural attacks (replace the old per-key path/order tamper) +// --------------------------------------------------------------------------- + +/// Tampering a sibling cut-hash breaks the root rebuild. (Subtree analogue of +/// the old per-key "tamper the inclusion path" attack.) +#[test] +fn tampered_cut_hash_rejected() { + let nonce = [0x0B; 32]; + let r = Responder::new(); + let pin = r.commit_to_range(256); + let (mut proof, commitment) = honest_proof_and_commitment(&r, &nonce); + assert!( + !proof.sibling_cut_hashes.is_empty(), + "a 256-leaf tree selects a deep subtree with cut-hashes" + ); + if let Some(c) = proof.sibling_cut_hashes.first_mut() { + c[0] ^= 0x01; + } + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + honest_bytes, + ); + assert!( + matches!(res, Err(AuditError::StructureInvalid(_))), + "tampered cut-hash must fail structure, got {res:?}" + ); +} + +/// Dropping a leaf yields the wrong leaf count for the agreed subtree. 
The +/// auditor re-derives the exact expected count from `(nonce, key_count)` and +/// rejects. +#[test] +fn wrong_leaf_count_rejected() { + let nonce = [0x0C; 32]; + let r = Responder::new(); + let pin = r.commit_to_range(100); + let (mut proof, commitment) = honest_proof_and_commitment(&r, &nonce); + proof.leaves.pop(); + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + honest_bytes, + ); + assert_eq!( + res, + Err(AuditError::StructureInvalid("wrong leaf count")), + "dropped leaf must fail the leaf-count check, got {res:?}" + ); +} + +/// Reordering leaves violates the strict ascending-key order the committed tree +/// enforces (and would otherwise let a responder shuffle leaves to dodge the +/// spot-check). Rejected structurally. +#[test] +fn reordered_leaves_rejected() { + let nonce = [0x0D; 32]; + let r = Responder::new(); + let pin = r.commit_to_range(100); + let (mut proof, commitment) = honest_proof_and_commitment(&r, &nonce); + assert!(proof.leaves.len() >= 2); + proof.leaves.swap(0, 1); + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + honest_bytes, + ); + assert!( + matches!(res, Err(AuditError::StructureInvalid(_))), + "reordered leaves must fail structure, got {res:?}" + ); +} + +/// Tampering a leaf's `bytes_hash` (claiming a different chunk at a committed +/// position) breaks the root rebuild — the leaf hash binds (key, bytes_hash). 
+#[test] +fn tampered_leaf_bytes_hash_rejected() { + let nonce = [0x0E; 32]; + let r = Responder::new(); + let pin = r.commit_to_range(100); + let (mut proof, commitment) = honest_proof_and_commitment(&r, &nonce); + proof.leaves[0].bytes_hash[0] ^= 0x01; + let res = auditor_accepts( + &r.peer_id_bytes, + &pin, + &nonce, + &commitment, + &proof, + honest_bytes, + ); + assert!( + matches!(res, Err(AuditError::StructureInvalid(_))), + "tampered bytes_hash must fail structure, got {res:?}" + ); +} + +// --------------------------------------------------------------------------- +// Repudiation: rejecting a recently-gossiped pinned commitment +// --------------------------------------------------------------------------- + +/// Attack: a responder repudiates a commitment it just gossiped — it answers a +/// pin for a commitment it no longer retains. Because the auditor only ever pins +/// a commitment the peer JUST gossiped, and an honest responder retains its last +/// two GOSSIPED commitments, a `lookup_by_hash` miss for a gossiped pin is a +/// confirmed failure. This test pins the retention contract: a gossiped pin +/// stays answerable across the next rotation, but a NEVER-gossiped commitment is +/// dropped on the next rotation (so the responder rightly cannot answer a pin it +/// never put on the wire). +#[test] +fn repudiating_a_gossiped_pin_is_detectable_via_lookup_miss() { + let r = Responder::new(); + let state = &r.state; + + // c1 is gossiped → must stay answerable across one rotation. + let h1 = r.commit_to_range(8); + state.mark_gossiped(h1); + assert!( + state.lookup_by_hash(&h1).is_some(), + "gossiped pin must be answerable immediately" + ); + + // Rotate + gossip c2. c1 is within the last-2-gossiped window → still here. + let h2 = r.commit_to_range(16); + state.mark_gossiped(h2); + assert!( + state.lookup_by_hash(&h1).is_some(), + "a gossiped commitment must survive one rotation (no false repudiation)" + ); + + // Rotate + gossip c3. 
Now the last-2-gossiped are {h3, h2}; h1 has aged out + // and is legitimately dropped (the auditor would no longer pin it). + let h3 = r.commit_to_range(24); + state.mark_gossiped(h3); + assert!( + state.lookup_by_hash(&h1).is_none(), + "h1 aged out of the gossip window" + ); + assert!(state.lookup_by_hash(&h2).is_some()); + assert!(state.lookup_by_hash(&h3).is_some()); + + // The detection edge: a commitment that was NEVER gossiped is dropped on the + // very next rotation, so a responder asked to answer a pin for an + // ungossiped-then-rotated commitment returns a lookup MISS — which the + // auditor (since it only pins gossiped roots) reads as repudiation. + let r2 = Responder::new(); + let ungossiped = r2.commit_to_range(8); + assert!(r2.state.lookup_by_hash(&ungossiped).is_some()); + let _next = r2.commit_to_range(16); // rotate without gossiping `ungossiped` + assert!( + r2.state.lookup_by_hash(&ungossiped).is_none(), + "an ungossiped commitment is dropped on the next rotation" + ); +} + +// --------------------------------------------------------------------------- +// Cross-check lemmas: the primitives the rejection tests rest on +// --------------------------------------------------------------------------- + +/// The commitment-hash pin is sensitive to every field. This underwrites every +/// "pin doesn't match" assertion above. 
#[test]
fn commitment_hash_is_field_sensitive() {
    let (public_key, secret_key) = keypair();
    let public_key_bytes = public_key.to_bytes();
    let signature = sign_commitment(&secret_key, &[0; 32], 1, &[0; 32], &public_key_bytes).unwrap();
    let baseline = StorageCommitment {
        root: [0; 32],
        key_count: 1,
        sender_peer_id: [0; 32],
        sender_public_key: public_key_bytes,
        signature,
    };
    let baseline_hash = commitment_hash(&baseline).unwrap();

    // One pass per field: perturb exactly that field on a fresh copy and
    // require the pin to move.
    for mutate in 0..5u8 {
        let mut candidate = baseline.clone();
        match mutate {
            0 => candidate.root[0] ^= 1,
            1 => candidate.key_count += 1,
            2 => candidate.sender_peer_id[0] ^= 1,
            3 => candidate.signature[0] ^= 1,
            4 => candidate.sender_public_key[0] ^= 1,
            _ => unreachable!(),
        }
        let candidate_hash = commitment_hash(&candidate).unwrap();
        assert_ne!(
            candidate_hash, baseline_hash,
            "mutation {mutate} should change commitment_hash"
        );
    }
}

/// The leaf hash covers both the key and the bytes hash: changing either one
/// yields a different leaf (and therefore a different root). The structural
/// rejections rest on this.
#[test]
fn leaf_hash_binds_key_and_bytes() {
    let baseline = leaf_hash(&key(1), &content_hash(1));
    let different_content = leaf_hash(&key(1), &content_hash(2));
    let different_key = leaf_hash(&key(2), &content_hash(1));

    // All three leaves must be pairwise distinct.
    assert_ne!(baseline, different_content);
    assert_ne!(baseline, different_key);
    assert_ne!(different_content, different_key);
}

/// A commitment signature verifies under the embedded key it was made for and
/// stops verifying once a different key is embedded.
#[test]
fn signature_round_trips_correctly() {
    let (signing_public, signing_secret) = keypair();
    let (unrelated_public, _unrelated_secret) = keypair();

    let signing_public_bytes = signing_public.to_bytes();
    let signature =
        sign_commitment(&signing_secret, &[7; 32], 42, &[3; 32], &signing_public_bytes).unwrap();
    let genuine = StorageCommitment {
        root: [7; 32],
        key_count: 42,
        sender_peer_id: [3; 32],
        sender_public_key: signing_public_bytes,
        signature,
    };
    assert!(verify_commitment_signature(&genuine));

    // Same payload and signature, but a different embedded public key.
    let rekeyed = StorageCommitment {
        sender_public_key: unrelated_public.to_bytes(),
        ..genuine
    };
    assert!(!verify_commitment_signature(&rekeyed));
}

/// The per-leaf possession hash takes nonce, peer, key, and bytes as inputs and
/// is the basis of the real-bytes spot-check. Varying any one input must change
/// the result, so a responder cannot reuse a possession hash across
/// nonces/peers/keys/chunks.
#[test]
fn nonced_leaf_hash_binds_all_inputs() {
    let baseline = nonced_leaf_hash(&[1; 32], &[2; 32], &key(3), b"chunk");

    // Each variant differs from the baseline in exactly one argument, in
    // parameter order.
    let single_input_variants = [
        nonced_leaf_hash(&[9; 32], &[2; 32], &key(3), b"chunk"),
        nonced_leaf_hash(&[1; 32], &[9; 32], &key(3), b"chunk"),
        nonced_leaf_hash(&[1; 32], &[2; 32], &key(9), b"chunk"),
        nonced_leaf_hash(&[1; 32], &[2; 32], &key(3), b"other"),
    ];
    for variant in single_input_variants {
        assert_ne!(baseline, variant);
    }
}