diff --git a/nexus/src/db/datastore.rs b/nexus/src/db/datastore.rs
index be6c31b053f..8a3f7070c49 100644
--- a/nexus/src/db/datastore.rs
+++ b/nexus/src/db/datastore.rs
@@ -2326,8 +2326,10 @@ impl DataStore {
             })
             .collect::<Vec<_>>();
 
+        // TODO: This should probably be removed when we can switch
+        // "test-privileged" and "test-unprivileged" to normal silo users rather
+        // than built-in users.
         debug!(opctx.log, "creating silo_user entries for built-in users");
-
         for builtin_user in &builtin_users {
             self.silo_user_create(SiloUser::new(
                 *SILO_ID,
@@ -2335,7 +2337,6 @@ impl DataStore {
             ))
             .await?;
         }
-
         info!(opctx.log, "created silo_user entries for built-in users");
 
         debug!(opctx.log, "attempting to create built-in users");
@@ -2512,23 +2513,27 @@ impl DataStore {
         })
     }
 
+    // NOTE: This function is only used for testing and for initial population
+    // of built-in users as silo users. The error handling here assumes (1)
+    // that the caller expects no user input error from the database, and (2)
+    // that if a Silo user with the same id already exists in the database,
+    // that's not an error (it's assumed to be the same user).
     pub async fn silo_user_create(
        &self,
        silo_user: SiloUser,
-    ) -> CreateResult<SiloUser> {
+    ) -> Result<(), Error> {
        use db::schema::silo_user::dsl;
-        diesel::insert_into(dsl::silo_user)
+        let _ = diesel::insert_into(dsl::silo_user)
            .values(silo_user)
-            .returning(SiloUser::as_returning())
-            .get_result_async(self.pool())
+            .on_conflict(dsl::id)
+            .do_nothing()
+            .execute_async(self.pool())
            .await
            .map_err(|e| {
-                Error::internal_error(&format!(
-                    "error creating silo user: {:?}",
-                    e
-                ))
-            })
+                public_error_from_diesel_pool(e, ErrorHandler::Server)
+            })?;
+        Ok(())
    }
 
    /// Load built-in silos into the database
@@ -2983,7 +2988,7 @@ mod test {
        datastore.session_create(&opctx, session.clone()).await.unwrap();
 
        // Associate silo with user
-        let silo_user = datastore
+        datastore
            .silo_user_create(SiloUser::new(*SILO_ID, silo_user_id))
            .await
            .unwrap();
@@ -2993,7 +2998,7 @@ mod test {
            .fetch()
            .await
            .unwrap();
-        assert_eq!(silo_user.silo_id, db_silo_user.silo_id,);
+        assert_eq!(*SILO_ID, db_silo_user.silo_id);
 
        // fetch the one we just created
        let (.., fetched) = LookupPath::new(&opctx, &datastore)
@@ -3481,11 +3486,10 @@ mod test {
 
        // Create a new Silo user so that we can lookup their keys.
        let silo_user_id = Uuid::new_v4();
-        let silo_user = datastore
+        datastore
            .silo_user_create(SiloUser::new(*SILO_ID, silo_user_id))
            .await
            .unwrap();
-        assert_eq!(silo_user.id(), silo_user_id);
 
        let (.., authz_user) = LookupPath::new(&opctx, &datastore)
            .silo_user_id(silo_user_id)
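
The interesting change here is `on_conflict(dsl::id).do_nothing()`: a duplicate insert now succeeds as a no-op instead of failing, which is what lets callers treat an already-present Silo user as "the same user". The same Diesel pattern in miniature, as a self-contained sketch (the `users` table and `NewUser` type are hypothetical stand-ins, and it uses a synchronous `PgConnection` rather than Omicron's async pool):

    use diesel::prelude::*;

    diesel::table! {
        users (id) {
            id -> Integer,
            name -> Text,
        }
    }

    #[derive(Insertable)]
    #[diesel(table_name = users)]
    struct NewUser {
        id: i32,
        name: String,
    }

    // Returns Ok whether or not the row already existed: the INSERT affects
    // one row the first time and zero rows on a conflicting retry, and
    // neither case is an error.
    fn insert_idempotent(
        conn: &mut PgConnection,
        user: &NewUser,
    ) -> QueryResult<usize> {
        diesel::insert_into(users::table)
            .values(user)
            .on_conflict(users::id)
            .do_nothing()
            .execute(conn)
    }

Note that the affected-row count is deliberately discarded in the datastore change above (`let _ =`), since "already there" and "newly inserted" are equally acceptable outcomes.
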
diff --git a/nexus/src/db/pool.rs b/nexus/src/db/pool.rs
index 1a9ddc2bac6..335b5bafb73 100644
--- a/nexus/src/db/pool.rs
+++ b/nexus/src/db/pool.rs
@@ -53,6 +53,16 @@ impl Pool {
         Pool { pool }
     }
 
+    pub fn new_failfast(db_config: &DbConfig) -> Self {
+        let manager =
+            ConnectionManager::<PgConnection>::new(&db_config.url.url());
+        let pool = bb8::Builder::new()
+            .connection_customizer(Box::new(DisallowFullTableScans {}))
+            .connection_timeout(std::time::Duration::from_millis(1))
+            .build_unchecked(manager);
+        Pool { pool }
+    }
+
     /// Returns a reference to the underlying pool.
     pub fn pool(&self) -> &bb8::Pool<ConnectionManager<PgConnection>> {
         &self.pool
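
`new_failfast` exists so tests can exercise the database-down path without waiting out bb8's default 30-second checkout timeout (see the test at the bottom of nexus/src/populate.rs below). Reduced to the bb8 essentials, and generic over any connection manager rather than Omicron's types, the pattern is:

    use std::time::Duration;

    // build_unchecked() returns immediately without opening any connections,
    // so constructing the pool never blocks even if the database is down;
    // the short connection_timeout then bounds how long each checkout waits
    // before returning an error.
    fn failfast_pool<M: bb8::ManageConnection>(manager: M) -> bb8::Pool<M> {
        bb8::Builder::new()
            .connection_timeout(Duration::from_millis(1))
            .build_unchecked(manager)
    }

One millisecond is arbitrary, as the comment in the test notes; the point is only that checkout failure is fast and deterministic when nothing is listening.
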
diff --git a/nexus/src/nexus.rs b/nexus/src/nexus.rs
index b6c4dfb3e35..cfaf16235cc 100644
--- a/nexus/src/nexus.rs
+++ b/nexus/src/nexus.rs
@@ -113,7 +113,7 @@ pub trait TestInterfaces {
         &self,
         silo_id: Uuid,
         silo_user_id: Uuid,
-    ) -> CreateResult<SiloUser>;
+    ) -> Result<(), Error>;
 }
 
 pub static BASE_ARTIFACT_DIR: &str = "/var/tmp/oxide_artifacts";
@@ -3980,7 +3980,7 @@ impl TestInterfaces for Nexus {
         &self,
         silo_id: Uuid,
         silo_user_id: Uuid,
-    ) -> CreateResult<SiloUser> {
+    ) -> Result<(), Error> {
         let silo_user = SiloUser::new(silo_id, silo_user_id);
         Ok(self.db_datastore.silo_user_create(silo_user).await?)
     }
diff --git a/nexus/src/populate.rs b/nexus/src/populate.rs
index 944e5308162..3d82fd9b27c 100644
--- a/nexus/src/populate.rs
+++ b/nexus/src/populate.rs
@@ -1,9 +1,53 @@
 //! Nexus startup task to load hardcoded data into the database
+//!
+//! Initial population of the CockroachDB database happens in two different ways:
+//!
+//! 1. During "rack setup" (or during `omicron-dev db-run` or test suite
+//!    initialization), we create the omicron database, schema, and the *bare
+//!    minimum* data that needs to be there.
+//! 2. Every time Nexus starts up, we attempt to insert a bunch of built-in
+//!    users, roles, role assignments, silos, etc. into the database.  We retry
+//!    this until we successfully insert everything we expect to or run into
+//!    some unexpected problem (e.g., an unexpected conflict, or a SQL syntax
+//!    error, or something like that).
+//!
+//! This file implements the second process.
+//!
+//! As much as possible, data should be inserted using the second process.
+//! That's because rack setup only ever happens once, so any data we add that
+//! way will never get added to systems that were deployed on a previous version
+//! of Omicron.  On the other hand, if data is inserted at Nexus startup, then
+//! the data will be inserted automatically on upgrade.  That's good: it means
+//! that for the most part, if you want to add a new built-in user, you can
+//! just do it and expect it to be there when your code is running.
+//!
+//! When Nexus starts up and goes to populate data, there are a few cases to
+//! consider:
+//!
+//! * Nexus comes up and CockroachDB is not available.  It should retry until
+//!   CockroachDB is available.
+//! * Nexus comes up and none of its data is present.  It should go ahead and
+//!   insert it.
+//! * Nexus comes up and some of its data is present.  It should still go ahead
+//!   and insert it, ignoring primary key conflicts (on the assumption that it's
+//!   the same data).  This deals with crashes during a previous attempt, but
+//!   also with the upgrade case mentioned above: future versions of Nexus can
+//!   deliver more data knowing it will be inserted the first time the new
+//!   version of Nexus comes online.
+//! * Nexus comes up and runs into a non-retryable problem doing any of this
+//!   (e.g., a SQL syntax error).  It logs an error.  This should eventually
+//!   raise a support case.  It shouldn't ever happen.
+//!
+//! To help do this correctly, we've defined the [`Populator`] trait.  This lets
+//! you define a single data-insertion step.  We have tests that ensure that
+//! each populator behaves as expected in the above ways.
 
 use crate::context::OpContext;
 use crate::db::DataStore;
 use futures::future::BoxFuture;
 use futures::FutureExt;
+use lazy_static::lazy_static;
+use omicron_common::api::external::Error;
 use omicron_common::backoff;
 use std::sync::Arc;
 
@@ -37,37 +81,12 @@ async fn populate(
     opctx: &OpContext,
     datastore: &DataStore,
 ) -> Result<(), String> {
-    use omicron_common::api::external::Error;
-    struct Populator<'a> {
-        name: &'static str,
-        func: &'a (dyn Fn() -> BoxFuture<'a, Result<(), Error>> + Send + Sync),
-    }
-
-    let populate_users = || {
-        async { datastore.load_builtin_users(opctx).await.map(|_| ()) }.boxed()
-    };
-    let populate_roles = || {
-        async { datastore.load_builtin_roles(opctx).await.map(|_| ()) }.boxed()
-    };
-    let populate_role_asgns = || {
-        async { datastore.load_builtin_role_asgns(opctx).await.map(|_| ()) }
-            .boxed()
-    };
-    let populate_silos = || {
-        async { datastore.load_builtin_silos(opctx).await.map(|_| ()) }.boxed()
-    };
-    let populators = [
-        Populator { name: "users", func: &populate_users },
-        Populator { name: "roles", func: &populate_roles },
-        Populator { name: "role assignments", func: &populate_role_asgns },
-        Populator { name: "silos", func: &populate_silos },
-    ];
-
-    for p in populators {
+    for p in *ALL_POPULATORS {
         let db_result = backoff::retry_notify(
             backoff::internal_service_policy(),
             || async {
-                (p.func)().await.map_err(|error| match &error {
+                p.populate(opctx, datastore).await.map_err(|error| match &error
+                {
                     Error::ServiceUnavailable { .. } => {
                         backoff::BackoffError::transient(error)
                     }
@@ -77,8 +96,8 @@ async fn populate(
             |error, delay| {
                 warn!(
                     opctx.log,
-                    "failed to populate built-in {}; will retry in {:?}",
-                    p.name,
+                    "failed to populate built-in {:?}; will retry in {:?}",
+                    p,
                     delay;
                     "error_message" => ?error,
                 );
@@ -91,7 +110,7 @@ async fn populate(
         // some other red flag that something is wrong.  (This should be
         // unlikely in practice.)
         error!(opctx.log,
-            "gave up trying to populate built-in {}", p.name;
+            "gave up trying to populate built-in {:?}", p;
             "error_message" => ?error
         );
     }
@@ -101,3 +120,189 @@ async fn populate(
 
     Ok(())
 }
+
+/// Each Populator is a simple call into the datastore to load a chunk of
+/// built-in (fixed) data that's shipped with Nexus.
+///
+/// Each Populator should only do one thing.
+trait Populator: std::fmt::Debug + Send + Sync {
+    fn populate<'a, 'b>(
+        &self,
+        opctx: &'a OpContext,
+        datastore: &'a DataStore,
+    ) -> BoxFuture<'b, Result<(), Error>>
+    where
+        'a: 'b;
+}
+
+/// Populates the built-in users
+#[derive(Debug)]
+struct PopulateBuiltinUsers;
+impl Populator for PopulateBuiltinUsers {
+    fn populate<'a, 'b>(
+        &self,
+        opctx: &'a OpContext,
+        datastore: &'a DataStore,
+    ) -> BoxFuture<'b, Result<(), Error>>
+    where
+        'a: 'b,
+    {
+        async { datastore.load_builtin_users(opctx).await.map(|_| ()) }.boxed()
+    }
+}
+
+/// Populates the built-in roles
+#[derive(Debug)]
+struct PopulateBuiltinRoles;
+impl Populator for PopulateBuiltinRoles {
+    fn populate<'a, 'b>(
+        &self,
+        opctx: &'a OpContext,
+        datastore: &'a DataStore,
+    ) -> BoxFuture<'b, Result<(), Error>>
+    where
+        'a: 'b,
+    {
+        async { datastore.load_builtin_roles(opctx).await.map(|_| ()) }.boxed()
+    }
+}
+
+/// Populates the built-in role assignments
+#[derive(Debug)]
+struct PopulateBuiltinRoleAssignments;
+impl Populator for PopulateBuiltinRoleAssignments {
+    fn populate<'a, 'b>(
+        &self,
+        opctx: &'a OpContext,
+        datastore: &'a DataStore,
+    ) -> BoxFuture<'b, Result<(), Error>>
+    where
+        'a: 'b,
+    {
+        async { datastore.load_builtin_role_asgns(opctx).await.map(|_| ()) }
+            .boxed()
+    }
+}
+
+/// Populates the built-in silo
+#[derive(Debug)]
+struct PopulateBuiltinSilos;
+impl Populator for PopulateBuiltinSilos {
+    fn populate<'a, 'b>(
+        &self,
+        opctx: &'a OpContext,
+        datastore: &'a DataStore,
+    ) -> BoxFuture<'b, Result<(), Error>>
+    where
+        'a: 'b,
+    {
+        async { datastore.load_builtin_silos(opctx).await.map(|_| ()) }.boxed()
+    }
+}
+
+lazy_static! {
+    static ref ALL_POPULATORS: [&'static dyn Populator; 4] = [
+        &PopulateBuiltinUsers,
+        &PopulateBuiltinRoles,
+        &PopulateBuiltinRoleAssignments,
+        &PopulateBuiltinSilos,
+    ];
+}
+
+#[cfg(test)]
+mod test {
+    use super::Populator;
+    use super::ALL_POPULATORS;
+    use crate::authn;
+    use crate::authz;
+    use crate::context::OpContext;
+    use crate::db;
+    use anyhow::Context;
+    use nexus_test_utils::db::test_setup_database;
+    use omicron_common::api::external::Error;
+    use omicron_test_utils::dev;
+    use std::sync::Arc;
+
+    #[tokio::test]
+    async fn test_populators() {
+        for p in *ALL_POPULATORS {
+            do_test_populator_idempotent(p).await;
+        }
+    }
+
+    async fn do_test_populator_idempotent(p: &dyn Populator) {
+        let logctx = dev::test_setup_log("test_populator");
+        let mut db = test_setup_database(&logctx.log).await;
+        let cfg = db::Config { url: db.pg_config().clone() };
+        let pool = Arc::new(db::Pool::new(&cfg));
+        let datastore = Arc::new(db::DataStore::new(pool));
+        let opctx = OpContext::for_background(
+            logctx.log.clone(),
+            Arc::new(authz::Authz::new(&logctx.log)),
+            authn::Context::internal_db_init(),
+            Arc::clone(&datastore),
+        );
+        let log = &logctx.log;
+
+        // Running each populator once under normal conditions should work.
+        info!(&log, "populator {:?}, run 1", p);
+        p.populate(&opctx, &datastore)
+            .await
+            .with_context(|| format!("populator {:?} (try 1)", p))
+            .unwrap();
+
+        // It should also work fine to run it again.
+        info!(&log, "populator {:?}, run 2 (idempotency check)", p);
+        p.populate(&opctx, &datastore)
+            .await
+            .with_context(|| {
+                format!(
+                    "populator {:?}: expected to be idempotent, but it failed \
+                     when run a second time",
+                    p
+                )
+            })
+            .unwrap();
+
+        info!(&log, "cleaning up database");
+        db.cleanup().await.unwrap();
+
+        // Test again with the database offline.  In principle we could do this
+        // immediately without creating a new pool and datastore.  However, the
+        // pool's default behavior is to wait 30 seconds for a connection, which
+        // makes this test take a long time.  (See the note in
+        // nexus/src/db/pool.rs about this.)  So let's create a pool with an
+        // arbitrarily short timeout now.  (We wouldn't want to do this above
+        // because we do want to wait a bit when we expect things to work, in
+        // case the test system is busy.)
+        //
+        // Anyway, if we try again with a broken database, we should get a
+        // ServiceUnavailable error, which indicates a transient failure.
+        let pool = Arc::new(db::Pool::new_failfast(&cfg));
+        let datastore = Arc::new(db::DataStore::new(pool));
+        let opctx = OpContext::for_background(
+            logctx.log.clone(),
+            Arc::new(authz::Authz::new(&logctx.log)),
+            authn::Context::internal_db_init(),
+            Arc::clone(&datastore),
+        );
+
+        info!(&log, "populator {:?}, with database offline", p);
+        match p.populate(&opctx, &datastore).await {
+            Err(Error::ServiceUnavailable { .. }) => (),
+            Ok(_) => panic!(
+                "populator {:?}: unexpectedly succeeded with no database",
+                p
+            ),
+            Err(error) => panic!(
+                "populator {:?}: expected ServiceUnavailable when the database \
+                 was down, but got {:#} ({:?})",
+                p,
+                error,
+                error
+            ),
+        };
+        info!(&log, "populator {:?} done", p);
+        logctx.cleanup_successful();
+    }
+}
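
The lifetime shape on `Populator::populate` is the one subtle part: the returned future borrows `opctx` and `datastore` (both `'a`), and the `'a: 'b` bound lets the boxed future be handed back with a possibly shorter lifetime `'b`. A self-contained sketch of the same shape, with hypothetical `Ctx`, `Step`, and `Noop` types standing in for Omicron's:

    use futures::future::BoxFuture;
    use futures::FutureExt;

    struct Ctx;

    trait Step: Send + Sync {
        // The future may borrow `ctx` (lifetime 'a); `'a: 'b` says callers
        // may request any shorter lifetime 'b for the boxed future.
        fn run<'a, 'b>(&self, ctx: &'a Ctx) -> BoxFuture<'b, ()>
        where
            'a: 'b;
    }

    struct Noop;
    impl Step for Noop {
        fn run<'a, 'b>(&self, ctx: &'a Ctx) -> BoxFuture<'b, ()>
        where
            'a: 'b,
        {
            async move {
                let _ = ctx; // the borrow may be held across await points
            }
            .boxed()
        }
    }

    fn main() {
        // Trait objects allow a fixed table of steps, mirroring
        // ALL_POPULATORS above.
        let steps: [&dyn Step; 1] = [&Noop];
        for s in steps {
            futures::executor::block_on(s.run(&Ctx));
        }
    }

The `lazy_static` table of `&'static dyn Populator` works the same way: each populator is a zero-sized unit struct, so a static table of trait objects costs nothing and lets both `populate()` and the tests iterate over every step uniformly.
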
diff --git a/nexus/tests/integration_tests/silos.rs b/nexus/tests/integration_tests/silos.rs
index b704b6d0c68..bad2be1294a 100644
--- a/nexus/tests/integration_tests/silos.rs
+++ b/nexus/tests/integration_tests/silos.rs
@@ -2,8 +2,6 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use uuid::Uuid;
-
 use nexus_test_utils::http_testing::{AuthnMode, NexusRequest};
 use omicron_nexus::external_api::views::{Organization, Silo};
 use omicron_nexus::TestInterfaces as _;
@@ -16,7 +14,6 @@ use nexus_test_utils::resource_helpers::{
 use nexus_test_utils::ControlPlaneTestContext;
 
 use nexus_test_utils_macros::nexus_test;
-use omicron_nexus::db::identity::Asset;
 
 #[nexus_test]
 async fn test_silos(cptestctx: &ControlPlaneTestContext) {
@@ -67,10 +64,12 @@ async fn test_silos(cptestctx: &ControlPlaneTestContext) {
     assert_eq!(silos[0].identity.name, "discoverable");
 
     // Create a new user in the discoverable silo
-    let new_silo_user = nexus
+    let new_silo_user_id =
+        "6922f0b2-9a92-659b-da6b-93ad4955a3a3".parse().unwrap();
+    nexus
         .silo_user_create(
             silos[0].identity.id, /* silo id */
-            Uuid::new_v4(),       /* silo user id */
+            new_silo_user_id,
         )
         .await
         .unwrap();
@@ -134,7 +133,7 @@
 
     // Verify silo user was also deleted
     nexus
-        .silo_user_fetch(authn_opctx, new_silo_user.id())
+        .silo_user_fetch(authn_opctx, new_silo_user_id)
         .await
         .expect_err("unexpected success");
 }
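
Stepping back, the contract that both `populate()` and `do_test_populator_idempotent` rely on is a single classification decision: `ServiceUnavailable` is transient and worth retrying, everything else is permanent. A self-contained sketch of that split, with simplified stand-ins for `omicron_common`'s error type and the backoff crate's (the real code uses `backoff::BackoffError` and `backoff::retry_notify`):

    // Simplified stand-ins; only the classification logic itself is real.
    #[allow(dead_code)]
    #[derive(Debug)]
    enum Error {
        ServiceUnavailable { internal_message: String },
        Other { internal_message: String },
    }

    #[derive(Debug)]
    enum BackoffError<E> {
        Transient(E),
        Permanent(E),
    }

    fn classify(error: Error) -> BackoffError<Error> {
        match &error {
            // The database being unreachable is expected while CockroachDB
            // is still coming up, so keep retrying with backoff.
            Error::ServiceUnavailable { .. } => {
                BackoffError::Transient(error)
            }
            // Anything else (a conflict we don't expect, a SQL syntax
            // error) will not fix itself; give up and log it.
            _ => BackoffError::Permanent(error),
        }
    }

    fn main() {
        let err = Error::ServiceUnavailable {
            internal_message: "no connection".to_string(),
        };
        // Prints "Transient(ServiceUnavailable { .. })": retry this one.
        println!("{:?}", classify(err));
    }

This is also why the failfast-pool test asserts specifically on `Error::ServiceUnavailable`: any other error coming out of a populator while the database is down would cause `populate()` to give up rather than wait for CockroachDB.
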