Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
6e094f4
[fm] Add disk diagnoser with typed fact tables
smklein Jun 2, 2026
bfedaa5
better errors
smklein Jun 3, 2026
1b5106a
clearer constraints, a non-null column for physical_id
smklein Jun 3, 2026
dfab3cf
[fm] Add saga diagnosis engine
smklein Jun 4, 2026
ae1ec41
merge
smklein Jun 10, 2026
bfb6a90
merge
smklein Jun 10, 2026
4b665e7
PR feedback: growable fact CHECK constraint, filter deleted disks, to…
smklein Jun 10, 2026
272b368
[fm] diagnosis engines read their Input from the SitrepBuilder
smklein Jun 10, 2026
14c2a06
drop unused lookup_fm_fact_physical_disk_for_case index
smklein Jun 10, 2026
d2a6767
[fm] disk diagnoser: index parent cases by disk
smklein Jun 10, 2026
1def948
merge
smklein Jun 10, 2026
89c04db
[fm] fm_fact_saga: per-kind implication CHECK constraints
smklein Jun 10, 2026
bb308b5
[fm] saga diagnoser: close cases on recovery
smklein Jun 10, 2026
3c1f976
drop unused lookup_fm_fact_saga_for_case index
smklein Jun 10, 2026
5b18ede
[fm] include observed sagas in the analysis input report
smklein Jun 10, 2026
c2ebef7
[fm] chunk saga_latest_node_event_times by SQL_BATCH_SIZE
smklein Jun 10, 2026
28fb5fa
[fm] saga diagnoser: remove duplicate facts of the same kind
smklein Jun 10, 2026
f799268
[fm] fix SagaOwnerState::Absent doc
smklein Jun 10, 2026
7b245fc
[fm] saga facts: payloads carry only condition-defining fields
smklein Jun 10, 2026
027622a
[fm] saga diagnoser: Abandoned fact kind
smklein Jun 11, 2026
82ef44f
simplify comments
smklein Jun 11, 2026
6f2e4a5
[fm] disk diagnoser: close uninterpretable cases; deflake omdb test
smklein Jun 11, 2026
cebfc8c
merge
smklein Jun 11, 2026
36e39ce
merge
smklein Jun 11, 2026
ee63736
[fm] saga diagnoser: close uninterpretable and duplicate cases
smklein Jun 11, 2026
83c73ad
[fm] derive fact row kind from payload via exhaustive match
smklein Jun 12, 2026
546fc5d
[fm] tidy disk diagnoser per review feedback
smklein Jun 15, 2026
2718755
[fm] split Fact into FactMetadata + payload
smklein Jun 15, 2026
0ada45d
[fm] disk diagnoser: clearer names for parsing a case
smklein Jun 15, 2026
39e4228
[fm] disk diagnoser: build case_by_disk in a single pass
smklein Jun 15, 2026
4b6e928
[fm] disk diagnoser: separate case-closing from fact reconciliation
smklein Jun 15, 2026
e731c5d
[fm] disk diagnoser tests: name the disk-fact triple, dedup fact-id l…
smklein Jun 16, 2026
f9479fc
[fm] address review feedback from eliza on #10541
smklein Jun 16, 2026
221dcc2
merge
smklein Jun 16, 2026
6c0adbe
merge fm-disk-diagnoser-typed into fm-saga-diagnoser
smklein Jun 16, 2026
5b48c08
[fm] saga diagnoser: apply lessons from disk diagnoser (#10541)
smklein Jun 16, 2026
3c5545d
merge
smklein Jun 16, 2026
dffc15c
merge
smklein Jun 16, 2026
0604e5e
[fm] omdb test: redact racy fm_sitrep_gc orphan counts
smklein Jun 16, 2026
ea79c0a
merge
smklein Jun 16, 2026
af8bd15
merge
smklein Jun 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,8 @@ task: "fm_analysis"

no in-service control plane disks

no non-terminal sagas observed

analysis report:
sitrep ID: ..........<REDACTED_UUID>...........
no cases changed in this analysis step
Expand Down Expand Up @@ -765,6 +767,8 @@ task: "fm_sitrep_gc"
batches: 1
orphaned fm_fact_physical_disk rows deleted: 0
batches: 1
orphaned fm_fact_saga rows deleted: 0
batches: 1
orphaned fm_sitrep_analysis_report rows deleted: <ORPHANED_FMSITREPANALYSISREPORT_ROWS_DELETED_REDACTED>
batches: 1
orphaned fm_support_bundle_request rows deleted: 0
Expand Down Expand Up @@ -1414,6 +1418,8 @@ task: "fm_analysis"

no in-service control plane disks

no non-terminal sagas observed

analysis report:
sitrep ID: ..........<REDACTED_UUID>...........
no cases changed in this analysis step
Expand Down Expand Up @@ -1463,6 +1469,8 @@ task: "fm_sitrep_gc"
batches: 1
orphaned fm_fact_physical_disk rows deleted: 0
batches: 1
orphaned fm_fact_saga rows deleted: 0
batches: 1
orphaned fm_sitrep_analysis_report rows deleted: <ORPHANED_FMSITREPANALYSISREPORT_ROWS_DELETED_REDACTED>
batches: 1
orphaned fm_support_bundle_request rows deleted: 0
Expand Down
6 changes: 4 additions & 2 deletions nexus/db-model/src/fm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ mod case;
pub use case::*;
mod diagnosis_engine;
pub use diagnosis_engine::*;
mod rendezvous_created;
pub use rendezvous_created::*;
mod fact_physical_disk;
pub use fact_physical_disk::*;
mod fact_saga;
pub use fact_saga::*;
mod rendezvous_created;
pub use rendezvous_created::*;
mod support_bundle_request;
pub use support_bundle_request::*;
mod sitrep_analysis_report;
Expand Down
3 changes: 3 additions & 0 deletions nexus/db-model/src/fm/diagnosis_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ impl_enum_type!(

PowerShelf => b"power_shelf"
PhysicalDisk => b"physical_disk"
Saga => b"saga"

);

Expand All @@ -35,6 +36,7 @@ impl From<DiagnosisEngine> for fm::DiagnosisEngineKind {
DiagnosisEngine::PhysicalDisk => {
fm::DiagnosisEngineKind::PhysicalDisk
}
DiagnosisEngine::Saga => fm::DiagnosisEngineKind::Saga,
}
}
}
Expand All @@ -46,6 +48,7 @@ impl From<fm::DiagnosisEngineKind> for DiagnosisEngine {
fm::DiagnosisEngineKind::PhysicalDisk => {
DiagnosisEngine::PhysicalDisk
}
fm::DiagnosisEngineKind::Saga => DiagnosisEngine::Saga,
}
}
}
Expand Down
235 changes: 235 additions & 0 deletions nexus/db-model/src/fm/fact_saga.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Database representation of the saga diagnosis engine's facts.
//!
//! Each saga fact is stored as typed columns in the `fm_fact_saga` table. The
//! `kind` discriminant selects which payload columns are populated; per-kind
//! CHECK constraints (e.g. `not_progressing_columns_present`) enforce that
//! the right columns are non-NULL for each kind. See
//! [`nexus_types::fm::SagaFact`] for semantics.

use crate::DbTypedUuid;
use crate::SagaState;
use crate::impl_enum_type;
use chrono::{DateTime, Utc};
use nexus_db_schema::schema::fm_fact_saga;
use nexus_types::fm;
use nexus_types::fm::case::FactMetadata;
use nexus_types::fm::{
FactPayload, SagaAbandonedFactPayload, SagaFact,
SagaNotProgressingFactPayload, SagaOwnerNotCurrentFactPayload,
};
use nexus_types::observed_saga::{OrphanedReason, SagaProgressState};
use omicron_common::api::external::Error;
use omicron_uuid_kinds::{CaseKind, FactKind, OmicronZoneKind, SitrepKind};
use uuid::Uuid;

// Discriminant stored in the `kind` column of `fm_fact_saga`. It selects which
// payload columns of a row are populated, and has one variant per `SagaFact`
// variant.
//
// NOTE(review): the byte strings are presumably the labels of the
// corresponding database enum -- confirm against `impl_enum_type!` and the
// schema before renaming any of them.
impl_enum_type!(
FmFactSagaKindEnum:

#[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq, Eq)]
pub enum FmFactSagaKind;

NotProgressing => b"not_progressing"
OwnerNotCurrentGeneration => b"owner_not_current_generation"
Abandoned => b"abandoned"
);

// Database-side counterpart of `OrphanedReason`, stored in the `orphan_reason`
// column (a payload column of `OwnerNotCurrentGeneration` facts). The `From`
// impls in this file convert between the two variant-for-variant.
//
// NOTE(review): the byte strings are presumably the labels of the
// corresponding database enum -- confirm against `impl_enum_type!` and the
// schema before renaming any of them.
impl_enum_type!(
FmFactSagaOrphanReasonEnum:

#[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq, Eq)]
pub enum FmFactSagaOrphanReason;

Quiesced => b"quiesced"
Expunged => b"expunged"
);

impl From<OrphanedReason> for FmFactSagaOrphanReason {
    /// Map an in-memory [`OrphanedReason`] onto its database representation.
    ///
    /// The mapping is one-to-one; the match is exhaustive so that a new
    /// `OrphanedReason` variant fails to compile until it is mapped here.
    fn from(reason: OrphanedReason) -> Self {
        match reason {
            OrphanedReason::Quiesced => Self::Quiesced,
            OrphanedReason::Expunged => Self::Expunged,
        }
    }
}

impl From<FmFactSagaOrphanReason> for OrphanedReason {
    /// Map a database `orphan_reason` value back onto the in-memory
    /// [`OrphanedReason`].
    ///
    /// This is the exact inverse of the conversion in the other direction.
    fn from(reason: FmFactSagaOrphanReason) -> Self {
        match reason {
            FmFactSagaOrphanReason::Quiesced => Self::Quiesced,
            FmFactSagaOrphanReason::Expunged => Self::Expunged,
        }
    }
}

impl From<SagaProgressState> for SagaState {
    /// Widen a non-terminal [`SagaProgressState`] into the database
    /// [`SagaState`] stored in the `saga_state` column.
    ///
    /// Every progress state has a same-named `SagaState` variant, so this
    /// conversion cannot fail. The reverse direction is fallible and lives in
    /// `saga_progress_state`.
    fn from(state: SagaProgressState) -> Self {
        match state {
            SagaProgressState::Running => Self::Running,
            SagaProgressState::Unwinding => Self::Unwinding,
        }
    }
}

/// Convert a DB `saga_state` back into the non-terminal [`SagaProgressState`]
/// recorded on a fact. Terminal states never appear on a saga fact (the case
/// is closed once the saga terminates), so they are treated as corrupt rows.
fn saga_progress_state(state: SagaState) -> Result<SagaProgressState, Error> {
match state {
SagaState::Running => Ok(SagaProgressState::Running),
SagaState::Unwinding => Ok(SagaProgressState::Unwinding),
SagaState::Done | SagaState::Abandoned => {
Err(Error::internal_error(&format!(
"fm_fact_saga row has terminal saga_state {state:?}, which \
should never be recorded on a saga fact"
)))
}
}
}

/// Diesel row for the `fm_fact_saga` table.
///
/// The payload columns are populated according to `kind`: a column is `Some`
/// if it belongs to that `kind`'s payload, and `None` otherwise (by
/// convention; the per-kind CHECK constraints, e.g.
/// `not_progressing_columns_present`, enforce only the "present for the
/// matching kind" direction so that future kinds may share columns).
///
/// Rows are built with [`FmFactSaga::from_sitrep`] and turned back into
/// in-memory facts with [`FmFactSaga::into_fact`].
// NOTE(review): `Queryable` maps columns by position, so the field order here
// presumably has to stay in sync with the `fm_fact_saga` table definition --
// confirm before reordering fields.
#[derive(Queryable, Insertable, Clone, Debug, Selectable)]
#[diesel(table_name = fm_fact_saga)]
pub struct FmFactSaga {
/// ID of the fact, taken from the fact's `FactMetadata`.
pub id: DbTypedUuid<FactKind>,
/// The sitrep to which this fact belongs.
///
/// This will change as the fact is carried forward from one sitrep to the
/// next.
pub sitrep_id: DbTypedUuid<SitrepKind>,
/// The case ID supplied alongside the fact in `from_sitrep`. It is not part
/// of the in-memory fact, so `into_fact` drops it.
pub case_id: DbTypedUuid<CaseKind>,
/// Sitrep in which this fact was first added.
///
/// Preserved unchanged when the fact is carried forward; debug-only.
pub created_sitrep_id: DbTypedUuid<SitrepKind>,
/// Free-form comment carried in the fact's `FactMetadata`.
pub comment: String,

/// The saga this fact is about. Common to every `kind`, so it is always
/// present (the column is `NOT NULL`).
pub saga_id: Uuid,
/// Discriminant selecting which of the payload columns below are populated.
pub kind: FmFactSagaKind,

// Columns for the `NotProgressing` kind.
pub saga_state: Option<SagaState>,
pub last_event_time: Option<DateTime<Utc>>,

// Columns for the `OwnerNotCurrentGeneration` kind.
pub current_sec: Option<DbTypedUuid<OmicronZoneKind>>,
pub orphan_reason: Option<FmFactSagaOrphanReason>,
// The `Abandoned` kind has no payload columns of its own: `from_sitrep`
// leaves all four `Option` columns above as `None` for it.
}

impl FmFactSaga {
/// Build a row from a fact's shared metadata (`metadata`) and its
/// already-dispatched saga payload (`saga_fact`).
pub fn from_sitrep(
sitrep_id: impl Into<DbTypedUuid<SitrepKind>>,
case_id: impl Into<DbTypedUuid<CaseKind>>,
metadata: &FactMetadata,
saga_fact: &SagaFact,
) -> Self {
// Destructure exhaustively: a new `FactMetadata` field will fail to
// compile here until it is mapped to a column.
let FactMetadata { id, created_sitrep_id, comment } = metadata;
let base = Self {
id: (*id).into(),
sitrep_id: sitrep_id.into(),
case_id: case_id.into(),
created_sitrep_id: (*created_sitrep_id).into(),
comment: comment.clone(),
saga_id: saga_fact.saga_id().0,
kind: FmFactSagaKind::NotProgressing,
saga_state: None,
last_event_time: None,
current_sec: None,
orphan_reason: None,
};
match saga_fact {
SagaFact::NotProgressing(p) => Self {
kind: FmFactSagaKind::NotProgressing,
saga_state: Some(p.saga_state.into()),
last_event_time: Some(p.last_event_time),
..base
},
SagaFact::OwnerNotCurrentGeneration(p) => Self {
kind: FmFactSagaKind::OwnerNotCurrentGeneration,
current_sec: Some(p.current_sec.into()),
orphan_reason: Some(p.orphan_reason.into()),
..base
},
// The Abandoned payload is pure identity (the condition is
// boolean), so the row carries only the common columns.
SagaFact::Abandoned(_) => {
Self { kind: FmFactSagaKind::Abandoned, ..base }
}
}
}

/// Reconstruct an in-memory fact from a row.
///
/// Columns the database's CHECK constraint guarantees are non-NULL for this
/// `kind` are unwrapped; a NULL where one is required indicates a corrupt
/// row and yields an internal error rather than a panic.
pub fn into_fact(self) -> Result<fm::case::Fact, Error> {
let kind = self.kind;
let saga_id = steno::SagaId(self.saga_id);
let payload = match kind {
FmFactSagaKind::NotProgressing => FactPayload::Saga(
SagaFact::NotProgressing(SagaNotProgressingFactPayload {
saga_id,
saga_state: saga_progress_state(
self.saga_state.ok_or_else(|| {
missing_column(kind, "saga_state")
})?,
)?,
last_event_time: self.last_event_time.ok_or_else(|| {
missing_column(kind, "last_event_time")
})?,
}),
),
FmFactSagaKind::OwnerNotCurrentGeneration => {
FactPayload::Saga(SagaFact::OwnerNotCurrentGeneration(
SagaOwnerNotCurrentFactPayload {
saga_id,
current_sec: self
.current_sec
.ok_or_else(|| missing_column(kind, "current_sec"))?
.into(),
orphan_reason: self
.orphan_reason
.ok_or_else(|| {
missing_column(kind, "orphan_reason")
})?
.into(),
},
))
}
FmFactSagaKind::Abandoned => FactPayload::Saga(
SagaFact::Abandoned(SagaAbandonedFactPayload { saga_id }),
),
};
Ok(fm::case::Fact {
metadata: fm::case::FactMetadata {
id: self.id.into(),
created_sitrep_id: self.created_sitrep_id.into(),
comment: self.comment,
},
payload,
})
}
}

fn missing_column(kind: FmFactSagaKind, column: &str) -> Error {
Error::internal_error(&format!(
"fm_fact_saga row of kind {kind:?} has a NULL {column}, violating the \
CHECK constraint requiring it to be non-NULL for this kind"
))
}
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(270, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(271, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(271, "fm-saga-de"),
KnownVersion::new(270, "fm-alert-resource-deletion"),
KnownVersion::new(269, "fm-disk-de-and-facts"),
KnownVersion::new(268, "fm-sitrep-analysis-report"),
Expand Down
Loading
Loading