From 8487e80691a13b7030cc7d3f1b6411fc8de77d2c Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 27 Aug 2019 21:29:11 +0200 Subject: [PATCH 01/32] Add dump_db task. --- src/bin/enqueue-job.rs | 19 ++-- src/tasks.rs | 2 + src/tasks/dump-db.toml | 177 ++++++++++++++++++++++++++++++ src/tasks/dump_db.rs | 244 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 433 insertions(+), 9 deletions(-) create mode 100644 src/tasks/dump-db.toml create mode 100644 src/tasks/dump_db.rs diff --git a/src/bin/enqueue-job.rs b/src/bin/enqueue-job.rs index 5a3494acdf0..b90aaf910ae 100644 --- a/src/bin/enqueue-job.rs +++ b/src/bin/enqueue-job.rs @@ -1,17 +1,18 @@ -use cargo_registry::util::{CargoError, CargoResult}; +use cargo_registry::util::{human, CargoError, CargoResult}; use cargo_registry::{db, tasks}; use std::env::args; use swirl::Job; fn main() -> CargoResult<()> { - let conn = db::connect_now()?; - match &*args().nth(1).unwrap_or_default() { - "update_downloads" => tasks::update_downloads() - .enqueue(&conn) - .map_err(|e| CargoError::from_std_error(e))?, - other => panic!("Unrecognized job type `{}`", other), - }; + "update_downloads" => enqueue(tasks::update_downloads()), + "dump_db" => enqueue(tasks::dump_db()), + other => Err(human(&format!("Unrecognized job type `{}`", other))), + } +} - Ok(()) +fn enqueue(job: J) -> CargoResult<()> { + let conn = db::connect_now()?; + job.enqueue(&conn) + .map_err(|e| CargoError::from_std_error(e)) } diff --git a/src/tasks.rs b/src/tasks.rs index 930f83bfc11..d4f2f3e3203 100644 --- a/src/tasks.rs +++ b/src/tasks.rs @@ -1,3 +1,5 @@ +mod dump_db; mod update_downloads; +pub use dump_db::dump_db; pub use update_downloads::update_downloads; diff --git a/src/tasks/dump-db.toml b/src/tasks/dump-db.toml new file mode 100644 index 00000000000..f246a11a3e2 --- /dev/null +++ b/src/tasks/dump-db.toml @@ -0,0 +1,177 @@ +# Column visibility in database dumps. Only public columns are included in the dump. 
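+#
+# The format, illustrated with a made-up table name (every real table is
+# listed below):
+#
+#   [widgets]
+#   filter = "NOT hidden"         # optional SQL expression used in a WHERE clause
+#   [widgets.columns]
+#   id = "public"                 # column is included in the dump
+#   secret = "private"            # column is excluded from the dump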
+[api_tokens.columns] +id = "private" +user_id = "private" +token = "private" +name = "private" +created_at = "private" +last_used_at = "private" +revoked = "private" + +[background_jobs.columns] +id = "private" +job_type = "private" +data = "private" +retries = "private" +last_retry = "private" +created_at = "private" + +[badges.columns] +crate_id = "public" +badge_type = "public" +attributes = "public" + +[categories.columns] +id = "public" +category = "public" +slug = "public" +description = "public" +crates_cnt = "public" +created_at = "public" +path = "public" + +[crate_owner_invitations.columns] +invited_user_id = "private" +invited_by_user_id = "private" +crate_id = "private" +created_at = "private" + +[crate_owners] +filter = "NOT deleted" +[crate_owners.columns] +crate_id = "public" +owner_id = "public" +created_at = "public" +created_by = "private" +deleted = "private" +updated_at = "public" +owner_kind = "public" + +[crates.columns] +id = "public" +name = "public" +updated_at = "public" +created_at = "public" +downloads = "public" +description = "public" +homepage = "public" +documentation = "public" +readme = "public" +textsearchable_index_col = "public" +license = "public" +repository = "public" +max_upload_size = "public" + +[crates_categories.columns] +crate_id = "public" +category_id = "public" + +[crates_keywords.columns] +crate_id = "public" +keyword_id = "public" + +[dependencies.columns] +id = "public" +version_id = "public" +crate_id = "public" +req = "public" +optional = "public" +default_features = "public" +features = "public" +target = "public" +kind = "public" + +[__diesel_schema_migrations.columns] +version = "private" +run_on = "private" + +[emails.columns] +id = "private" +user_id = "private" +email = "private" +verified = "private" +token = "private" +token_generated_at = "private" + +[follows.columns] +user_id = "private" +crate_id = "private" + +[keywords.columns] +id = "public" +keyword = "public" +crates_cnt = "public" +created_at = "public" + +[metadata.columns] +total_downloads = "public" + +[publish_limit_buckets.columns] +user_id = "private" +tokens = "private" +last_refill = "private" + +[publish_rate_overrides.columns] +user_id = "private" +burst = "private" + +[readme_renderings.columns] +version_id = "public" +rendered_at = "public" + +[reserved_crate_names.columns] +name = "public" + +[teams.columns] +id = "public" +login = "public" +github_id = "public" +name = "public" +avatar = "public" + +[users] +filter = """ +id in ( + SELECT owner_id AS user_id FROM crate_owners WHERE NOT deleted AND owner_kind = 0 + UNION + SELECT published_by as user_id FROM versions +) +""" +[users.columns] +id = "public" +email = "private" +gh_access_token = "private" +gh_login = "public" +name = "public" +gh_avatar = "public" +gh_id = "public" + +[version_authors.columns] +id = "public" +version_id = "public" +user_id = "private" +name = "public" + +[version_downloads.columns] +version_id = "public" +downloads = "public" +counted = "private" +date = "public" +processed = "private" + +[versions.columns] +id = "public" +crate_id = "public" +num = "public" +updated_at = "public" +created_at = "public" +downloads = "public" +features = "public" +yanked = "public" +license = "public" +crate_size = "public" +published_by = "public" + +[versions_published_by.columns] +version_id = "private" +email = "private" diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs new file mode 100644 index 00000000000..0f04b492058 --- /dev/null +++ b/src/tasks/dump_db.rs @@ -0,0 +1,244 @@ 
+use std::{ + collections::BTreeMap, + path::{Path, PathBuf}, +}; + +use crate::{ + background_jobs::Environment, env, uploaders::Uploader, util::errors::std_error_no_send, +}; + +use swirl::PerformError; + +/// Create CSV dumps of the public information in the database, wrap them in a +/// tarball and upload to S3. +#[swirl::background_job] +pub fn dump_db(env: &Environment) -> Result<(), PerformError> { + // TODO make path configurable + const EXPORT_DIR_TEMPLATE: &str = "/tmp/dump-db/%Y-%m-%d-%H%M%S"; + let export_dir = PathBuf::from(chrono::Utc::now().format(EXPORT_DIR_TEMPLATE).to_string()); + std::fs::create_dir_all(&export_dir)?; + run_psql(&export_dir)?; + let tarball = create_tarball(&export_dir)?; + upload_tarball(&tarball, &env.uploader)?; + // TODO: more robust cleanup + std::fs::remove_dir_all(&export_dir)?; + std::fs::remove_file(&tarball)?; + Ok(()) +} + +/// An enum indicating whether a column is included in the database dumps. +/// Public columns are included, private are not. +#[derive(Clone, Copy, Debug, Deserialize, PartialEq)] +#[serde(rename_all = "lowercase")] +enum ColumnVisibility { + Private, + Public, +} + +#[derive(Clone, Debug, Deserialize)] +struct TableConfig { + filter: Option, + columns: BTreeMap, +} + +/// Maps table names to the respective configurations +type VisibilityConfig = BTreeMap; + +fn load_visibility_config() -> VisibilityConfig { + toml::from_str(include_str!("dump-db.toml")).unwrap() +} + +impl TableConfig { + fn columns_str(&self) -> String { + self.columns + .iter() + .filter(|&(_, &vis)| vis == ColumnVisibility::Public) + .map(|(col, _)| format!("\"{}\"", col)) + .collect::>() + .join(", ") + } + + fn view_sql(&self, table: &str) -> String { + self.filter + .as_ref() + .map(|filter| { + format!( + r#" + CREATE TEMPORARY VIEW "dump_db_{table}" AS ( + SELECT {columns} + FROM "{table}" + WHERE {filter} + ); + "#, + table = table, + columns = self.columns_str(), + filter = filter, + ) + }) + .unwrap_or_default() + } + + fn copy_sql(&self, table: &str) -> String { + if self.filter.is_some() { + format!( + r#" + \copy (SELECT * FROM "dump_db_{table}") TO '{table}.csv' WITH CSV HEADER + "#, + table = table, + ) + } else { + let cols_str = self.columns_str(); + if cols_str.is_empty() { + String::new() + } else { + format!( + r#" + \copy "{table}" ({columns}) TO '{table}.csv' WITH CSV HEADER + "#, + table = table, + columns = cols_str, + ) + } + } + } +} + +fn gen_psql_script() -> String { + let config = load_visibility_config(); + let view_sql = config + .iter() + .map(|(table, config)| config.view_sql(table)) + .collect::>() + .concat(); + let copy_sql = config + .iter() + .map(|(table, config)| config.copy_sql(table)) + .collect::>() + .concat(); + format!( + r#" + BEGIN; + {view_sql} + COMMIT; + BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + {copy_sql} + COMMIT; + "#, + view_sql = view_sql, + copy_sql = copy_sql, + ) +} + +fn run_psql(export_dir: &Path) -> Result<(), PerformError> { + use std::io::prelude::*; + use std::process::{Command, Stdio}; + + let psql_script = gen_psql_script(); + // TODO Redirect stdout and stderr to avoid polluting the worker logs. 
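+    // The script is fed to psql on stdin, and psql runs with the export
+    // directory as its working directory, so the relative paths in the
+    // generated `\copy` commands write one CSV file per table into that
+    // directory.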
+ let mut psql = Command::new("psql") + .arg(env("DATABASE_URL")) + .current_dir(export_dir) + .stdin(Stdio::piped()) + .spawn()?; + let mut stdin = psql.stdin.take().unwrap(); + stdin.write_all(psql_script.as_bytes())?; + drop(stdin); + psql.wait()?; + Ok(()) +} + +fn create_tarball(export_dir: &Path) -> Result { + let tarball_name = export_dir.with_extension("tar.gz"); + let tarball = std::fs::File::create(&tarball_name)?; + let encoder = flate2::write::GzEncoder::new(tarball, flate2::Compression::default()); + let mut archive = tar::Builder::new(encoder); + archive.append_dir_all(export_dir.file_name().unwrap(), &export_dir)?; + Ok(tarball_name) +} + +fn upload_tarball(tarball: &Path, uploader: &Uploader) -> Result<(), PerformError> { + use std::io::Read; + + let client = reqwest::Client::new(); + let target_name = format!("dumps/{}", tarball.file_name().unwrap().to_str().unwrap()); + let mut buf = vec![]; + // TODO: find solution that does not require holding the whole database + // export in memory at once. + std::fs::File::open(tarball)?.read_to_end(&mut buf)?; + uploader + .upload(&client, &target_name, buf, "application/gzip") + .map_err(std_error_no_send)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::pg_connection; + use diesel::prelude::*; + use std::collections::HashSet; + + /// Test whether the schema in the visibility configuration matches the test database. + #[test] + fn check_visibility_config() { + let conn = pg_connection(); + let db_columns: HashSet<_> = get_db_columns(&conn) + .into_iter() + .map(|c| (c.table_name, c.column_name)) + .collect(); + let vis_columns: HashSet<_> = load_visibility_config() + .iter() + .flat_map(|(table, config)| { + config + .columns + .iter() + .map(move |(column, _)| (table.clone(), column.clone())) + }) + .collect(); + let mut errors = vec![]; + for (table, col) in db_columns.difference(&vis_columns) { + errors.push(format!( + "No visibility information for columns {}.{}.", + table, col + )); + } + for (table, col) in vis_columns.difference(&db_columns) { + errors.push(format!( + "Column {}.{} does not exist in the database.", + table, col + )); + } + assert!( + errors.is_empty(), + "The visibility configuration does not match the database schema:\n{}", + errors.join("\n - "), + ); + } + + mod information_schema { + table! { + information_schema.columns (table_schema, table_name, column_name) { + table_schema -> Text, + table_name -> Text, + column_name -> Text, + ordinal_position -> Integer, + } + } + } + + #[derive(Debug, PartialEq, Queryable)] + struct Column { + table_name: String, + column_name: String, + } + + fn get_db_columns(conn: &PgConnection) -> Vec { + use information_schema::columns::dsl::*; + columns + .select((table_name, column_name)) + .filter(table_schema.eq("public")) + .order_by((table_name, ordinal_position)) + .load(conn) + .unwrap() + } +} From 32ec9440d7e1f13b0186662e66668f5d513955b1 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sat, 17 Aug 2019 21:39:38 +0200 Subject: [PATCH 02/32] Add documentation to dump-db.toml. --- src/tasks/dump-db.toml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/tasks/dump-db.toml b/src/tasks/dump-db.toml index f246a11a3e2..60fd7de7b94 100644 --- a/src/tasks/dump-db.toml +++ b/src/tasks/dump-db.toml @@ -1,4 +1,13 @@ -# Column visibility in database dumps. Only public columns are included in the dump. +# This file configures what to include in public database dumps. 
For each +# database table, we set which columns are included in the dump, and optionally +# how to filter the rows. +# +# .columns - a TOML dictionary determining what columns to include. +# possible values are "private" (not included) and "public" (included). +# +# .filter - a string that is a valid SQL expression, which is used +# in a WHERE clause to filter the rows of the table. + [api_tokens.columns] id = "private" user_id = "private" From 4bfd40c7a01f6eeb9e7afe7217284c866b3079ad Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sat, 17 Aug 2019 21:55:44 +0200 Subject: [PATCH 03/32] Refactor enqueue helper in the enqueue-job binary. --- src/bin/enqueue-job.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/bin/enqueue-job.rs b/src/bin/enqueue-job.rs index b90aaf910ae..43888a5880f 100644 --- a/src/bin/enqueue-job.rs +++ b/src/bin/enqueue-job.rs @@ -1,18 +1,23 @@ use cargo_registry::util::{human, CargoError, CargoResult}; use cargo_registry::{db, tasks}; +use diesel::PgConnection; use std::env::args; -use swirl::Job; fn main() -> CargoResult<()> { + let conn = db::connect_now()?; match &*args().nth(1).unwrap_or_default() { - "update_downloads" => enqueue(tasks::update_downloads()), - "dump_db" => enqueue(tasks::dump_db()), + "update_downloads" => tasks::update_downloads().enqueue(&conn), + "dump_db" => tasks::dump_db().enqueue(&conn), other => Err(human(&format!("Unrecognized job type `{}`", other))), } } -fn enqueue(job: J) -> CargoResult<()> { - let conn = db::connect_now()?; - job.enqueue(&conn) - .map_err(|e| CargoError::from_std_error(e)) +/// Helper to map the `PerformError` returned by `swirl::Job::enqueue()` to a +/// `CargoError`. Can be removed once `map_err()` isn't needed any more. +trait Enqueue: swirl::Job { + fn enqueue(self, conn: &PgConnection) -> CargoResult<()> { + ::enqueue(self, conn).map_err(|e| CargoError::from_std_error(e)) + } } + +impl Enqueue for J {} From 1d500844c21e94eeb837f291982da345d9f57a0f Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sat, 17 Aug 2019 22:46:21 +0200 Subject: [PATCH 04/32] Reduce global state in dump_db task. 
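Parse the visibility configuration and choose the database URL (TEST_DATABASE_URL
when compiled for tests) in the dump_db job itself, instead of reading
DATABASE_URL from the environment deep inside run_psql. The configuration, the
URL and the upload target name are now passed down explicitly to run_psql and
upload_tarball.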
--- src/tasks/dump_db.rs | 102 +++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 0f04b492058..92eebe1a5c3 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -3,9 +3,7 @@ use std::{ path::{Path, PathBuf}, }; -use crate::{ - background_jobs::Environment, env, uploaders::Uploader, util::errors::std_error_no_send, -}; +use crate::{background_jobs::Environment, uploaders::Uploader, util::errors::std_error_no_send}; use swirl::PerformError; @@ -17,9 +15,16 @@ pub fn dump_db(env: &Environment) -> Result<(), PerformError> { const EXPORT_DIR_TEMPLATE: &str = "/tmp/dump-db/%Y-%m-%d-%H%M%S"; let export_dir = PathBuf::from(chrono::Utc::now().format(EXPORT_DIR_TEMPLATE).to_string()); std::fs::create_dir_all(&export_dir)?; - run_psql(&export_dir)?; + let visibility_config = toml::from_str(include_str!("dump-db.toml")).unwrap(); + let database_url = if cfg!(test) { + crate::env("TEST_DATABASE_URL") + } else { + crate::env("DATABASE_URL") + }; + run_psql(&visibility_config, &database_url, &export_dir)?; let tarball = create_tarball(&export_dir)?; - upload_tarball(&tarball, &env.uploader)?; + let target_name = format!("dumps/{}", tarball.file_name().unwrap().to_str().unwrap()); + upload_tarball(&tarball, &target_name, &env.uploader)?; // TODO: more robust cleanup std::fs::remove_dir_all(&export_dir)?; std::fs::remove_file(&tarball)?; @@ -41,13 +46,6 @@ struct TableConfig { columns: BTreeMap, } -/// Maps table names to the respective configurations -type VisibilityConfig = BTreeMap; - -fn load_visibility_config() -> VisibilityConfig { - toml::from_str(include_str!("dump-db.toml")).unwrap() -} - impl TableConfig { fn columns_str(&self) -> String { self.columns @@ -103,40 +101,52 @@ impl TableConfig { } } -fn gen_psql_script() -> String { - let config = load_visibility_config(); - let view_sql = config - .iter() - .map(|(table, config)| config.view_sql(table)) - .collect::>() - .concat(); - let copy_sql = config - .iter() - .map(|(table, config)| config.copy_sql(table)) - .collect::>() - .concat(); - format!( - r#" - BEGIN; - {view_sql} - COMMIT; - BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; - {copy_sql} - COMMIT; - "#, - view_sql = view_sql, - copy_sql = copy_sql, - ) +/// Maps table names to the respective configurations +#[derive(Clone, Debug, Deserialize)] +#[serde(transparent)] +struct VisibilityConfig(BTreeMap); + +impl VisibilityConfig { + fn gen_psql_script(&self) -> String { + let view_sql = self + .0 + .iter() + .map(|(table, config)| config.view_sql(table)) + .collect::>() + .concat(); + let copy_sql = self + .0 + .iter() + .map(|(table, config)| config.copy_sql(table)) + .collect::>() + .concat(); + format!( + r#" + BEGIN; + {view_sql} + COMMIT; + BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + {copy_sql} + COMMIT; + "#, + view_sql = view_sql, + copy_sql = copy_sql, + ) + } } -fn run_psql(export_dir: &Path) -> Result<(), PerformError> { +fn run_psql( + config: &VisibilityConfig, + database_url: &str, + export_dir: &Path, +) -> Result<(), PerformError> { use std::io::prelude::*; use std::process::{Command, Stdio}; - let psql_script = gen_psql_script(); + let psql_script = config.gen_psql_script(); // TODO Redirect stdout and stderr to avoid polluting the worker logs. 
let mut psql = Command::new("psql") - .arg(env("DATABASE_URL")) + .arg(database_url) .current_dir(export_dir) .stdin(Stdio::piped()) .spawn()?; @@ -156,17 +166,20 @@ fn create_tarball(export_dir: &Path) -> Result { Ok(tarball_name) } -fn upload_tarball(tarball: &Path, uploader: &Uploader) -> Result<(), PerformError> { +fn upload_tarball( + tarball: &Path, + target_name: &str, + uploader: &Uploader, +) -> Result<(), PerformError> { use std::io::Read; let client = reqwest::Client::new(); - let target_name = format!("dumps/{}", tarball.file_name().unwrap().to_str().unwrap()); let mut buf = vec![]; // TODO: find solution that does not require holding the whole database // export in memory at once. std::fs::File::open(tarball)?.read_to_end(&mut buf)?; uploader - .upload(&client, &target_name, buf, "application/gzip") + .upload(&client, target_name, buf, "application/gzip") .map_err(std_error_no_send)?; Ok(()) } @@ -186,7 +199,10 @@ mod tests { .into_iter() .map(|c| (c.table_name, c.column_name)) .collect(); - let vis_columns: HashSet<_> = load_visibility_config() + let visibility_config: VisibilityConfig = + toml::from_str(include_str!("dump-db.toml")).unwrap(); + let vis_columns: HashSet<_> = visibility_config + .0 .iter() .flat_map(|(table, config)| { config From 0ed2929de02e7795c724e6223b9bd9e2679bfe73 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 22 Aug 2019 00:30:24 +0200 Subject: [PATCH 05/32] Address review comments by @carols10cents. --- src/tasks/dump-db.toml | 4 ++-- src/tasks/dump_db.rs | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/tasks/dump-db.toml b/src/tasks/dump-db.toml index 60fd7de7b94..6465d53bb99 100644 --- a/src/tasks/dump-db.toml +++ b/src/tasks/dump-db.toml @@ -125,8 +125,8 @@ user_id = "private" burst = "private" [readme_renderings.columns] -version_id = "public" -rendered_at = "public" +version_id = "private" +rendered_at = "private" [reserved_crate_names.columns] name = "public" diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 92eebe1a5c3..e3b0a062bd8 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -40,6 +40,9 @@ enum ColumnVisibility { Public, } +/// Filtering information for a single table. The `filter` field is a valid SQL +/// expression used in a `WHERE` clause to filter the rows of the table. The +/// `columns` field maps column names to their respective visibilities. #[derive(Clone, Debug, Deserialize)] struct TableConfig { filter: Option, @@ -101,7 +104,7 @@ impl TableConfig { } } -/// Maps table names to the respective configurations +/// Maps table names to the respective configurations. Used to load `dump_db.toml`. #[derive(Clone, Debug, Deserialize)] #[serde(transparent)] struct VisibilityConfig(BTreeMap); From c2dce7c37d9bb5eeca09e774defb19c0677082be Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sat, 24 Aug 2019 22:17:58 +0200 Subject: [PATCH 06/32] Make database URL and upload target of DB dumps configurable. 
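The dump_db job now takes the database URL and the upload target name as
arguments. The enqueue-job binary reads them as optional positional arguments
after the job name, falling back to the DATABASE_URL environment variable and
to "db-dump.tar.gz". Roughly (the exact way the binary is invoked depends on
the deployment):

    enqueue-job dump_db [DATABASE_URL] [TARGET_NAME]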
--- src/bin/enqueue-job.rs | 14 ++++++++++---- src/tasks/dump_db.rs | 12 +++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/bin/enqueue-job.rs b/src/bin/enqueue-job.rs index 43888a5880f..290219421a8 100644 --- a/src/bin/enqueue-job.rs +++ b/src/bin/enqueue-job.rs @@ -1,13 +1,19 @@ use cargo_registry::util::{human, CargoError, CargoResult}; -use cargo_registry::{db, tasks}; +use cargo_registry::{db, env, tasks}; use diesel::PgConnection; -use std::env::args; fn main() -> CargoResult<()> { let conn = db::connect_now()?; - match &*args().nth(1).unwrap_or_default() { + let mut args = std::env::args().skip(1); + match &*args.next().unwrap_or_default() { "update_downloads" => tasks::update_downloads().enqueue(&conn), - "dump_db" => tasks::dump_db().enqueue(&conn), + "dump_db" => { + let database_url = args.next().unwrap_or_else(|| env("DATABASE_URL")); + let target_name = args + .next() + .unwrap_or_else(|| String::from("db-dump.tar.gz")); + tasks::dump_db(database_url, target_name).enqueue(&conn) + } other => Err(human(&format!("Unrecognized job type `{}`", other))), } } diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index e3b0a062bd8..a202ef45abc 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -10,20 +10,18 @@ use swirl::PerformError; /// Create CSV dumps of the public information in the database, wrap them in a /// tarball and upload to S3. #[swirl::background_job] -pub fn dump_db(env: &Environment) -> Result<(), PerformError> { +pub fn dump_db( + env: &Environment, + database_url: String, + target_name: String, +) -> Result<(), PerformError> { // TODO make path configurable const EXPORT_DIR_TEMPLATE: &str = "/tmp/dump-db/%Y-%m-%d-%H%M%S"; let export_dir = PathBuf::from(chrono::Utc::now().format(EXPORT_DIR_TEMPLATE).to_string()); std::fs::create_dir_all(&export_dir)?; let visibility_config = toml::from_str(include_str!("dump-db.toml")).unwrap(); - let database_url = if cfg!(test) { - crate::env("TEST_DATABASE_URL") - } else { - crate::env("DATABASE_URL") - }; run_psql(&visibility_config, &database_url, &export_dir)?; let tarball = create_tarball(&export_dir)?; - let target_name = format!("dumps/{}", tarball.file_name().unwrap().to_str().unwrap()); upload_tarball(&tarball, &target_name, &env.uploader)?; // TODO: more robust cleanup std::fs::remove_dir_all(&export_dir)?; From a648dbd34f974bdf8df1f94b73903886621ba7e6 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Mon, 26 Aug 2019 21:15:55 +0200 Subject: [PATCH 07/32] Make cleanup of old database files more robust. 
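Register cleanup of the export directory and the tarball with scopeguard's
defer! right after each is created, so they are removed even when a later step
returns early with an error, and derive the export path from
std::env::temp_dir() instead of a hard-coded /tmp location. A minimal sketch of
the pattern, with a made-up directory name and assuming the scopeguard
dependency added to Cargo.toml in this commit:

    use scopeguard::defer;

    fn export() -> std::io::Result<()> {
        let dir = std::env::temp_dir().join("dump-db-example");
        std::fs::create_dir_all(&dir)?;
        defer! {{
            // Runs on every exit from this scope, including early `?` returns below.
            std::fs::remove_dir_all(&dir).unwrap();
        }}
        std::fs::write(dir.join("data.csv"), "id\n1\n")?; // any fallible step
        Ok(())
    }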
--- Cargo.lock | 1 + Cargo.toml | 1 + src/tasks/dump_db.rs | 16 ++++++++++------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f6c10918f40..5f45ab931c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -217,6 +217,7 @@ dependencies = [ "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "reqwest 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)", "scheduled-thread-pool 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "semver 0.9.0 (git+https://github.com/steveklabnik/semver.git)", "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index 0c540febc50..47c75e20f6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,6 +83,7 @@ tokio = "0.1" hyper = "0.12" ctrlc = { version = "3.0", features = ["termination"] } indexmap = "1.0.2" +scopeguard = "0.3.3" [dev-dependencies] conduit-test = "0.8" diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index a202ef45abc..5bed4a87aaa 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -5,6 +5,7 @@ use std::{ use crate::{background_jobs::Environment, uploaders::Uploader, util::errors::std_error_no_send}; +use scopeguard::defer; use swirl::PerformError; /// Create CSV dumps of the public information in the database, wrap them in a @@ -15,17 +16,20 @@ pub fn dump_db( database_url: String, target_name: String, ) -> Result<(), PerformError> { - // TODO make path configurable - const EXPORT_DIR_TEMPLATE: &str = "/tmp/dump-db/%Y-%m-%d-%H%M%S"; - let export_dir = PathBuf::from(chrono::Utc::now().format(EXPORT_DIR_TEMPLATE).to_string()); + let timestamp = chrono::Utc::now().format("%Y-%m-%d-%H%M%S").to_string(); + let export_dir = std::env::temp_dir().join("dump-db").join(timestamp); std::fs::create_dir_all(&export_dir)?; + defer! {{ + std::fs::remove_dir_all(&export_dir).unwrap(); + }} let visibility_config = toml::from_str(include_str!("dump-db.toml")).unwrap(); run_psql(&visibility_config, &database_url, &export_dir)?; let tarball = create_tarball(&export_dir)?; + defer! {{ + std::fs::remove_file(&tarball).unwrap(); + }} upload_tarball(&tarball, &target_name, &env.uploader)?; - // TODO: more robust cleanup - std::fs::remove_dir_all(&export_dir)?; - std::fs::remove_file(&tarball)?; + println!("Database dump uploaded to {}.", &target_name); Ok(()) } From 29a86e9537147ce1d0af4aa09a47be84a519c9dc Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 27 Aug 2019 21:35:28 +0200 Subject: [PATCH 08/32] Adapt dump_db to new uploader interface. --- src/tasks/dump_db.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 5bed4a87aaa..fa54e1264ab 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -176,15 +176,18 @@ fn upload_tarball( target_name: &str, uploader: &Uploader, ) -> Result<(), PerformError> { - use std::io::Read; - let client = reqwest::Client::new(); - let mut buf = vec![]; - // TODO: find solution that does not require holding the whole database - // export in memory at once. - std::fs::File::open(tarball)?.read_to_end(&mut buf)?; + let tarfile = std::fs::File::open(tarball)?; + let content_length = tarfile.metadata()?.len(); + // TODO Figure out the correct content type. 
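+    // The open file handle and its length are passed to the uploader, which
+    // replaces the previous approach of reading the whole tarball into a
+    // buffer before uploading it.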
uploader - .upload(&client, target_name, buf, "application/gzip") + .upload( + &client, + target_name, + tarfile, + content_length, + "application/gzip", + ) .map_err(std_error_no_send)?; Ok(()) } From ec06f101cdc55412bd35e9b204140356f513e18a Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Mon, 26 Aug 2019 21:26:03 +0200 Subject: [PATCH 09/32] Correct error handling for the psql call. --- src/tasks/dump_db.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index fa54e1264ab..c5d2ffa8568 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -149,16 +149,29 @@ fn run_psql( use std::process::{Command, Stdio}; let psql_script = config.gen_psql_script(); - // TODO Redirect stdout and stderr to avoid polluting the worker logs. let mut psql = Command::new("psql") .arg(database_url) .current_dir(export_dir) .stdin(Stdio::piped()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()) .spawn()?; let mut stdin = psql.stdin.take().unwrap(); - stdin.write_all(psql_script.as_bytes())?; - drop(stdin); - psql.wait()?; + let input_thread = std::thread::spawn(move || -> std::io::Result<()> { + stdin.write_all(psql_script.as_bytes())?; + Ok(()) + }); + let output = psql.wait_with_output()?; + input_thread.join().unwrap()?; + if !output.stderr.is_empty() { + Err(format!( + "Error while executing psql: {}", + String::from_utf8_lossy(&output.stderr) + ))?; + } + if !output.status.success() { + Err("psql did not finish successfully.")?; + } Ok(()) } From d4b9f8cf36be72152a30c8cb8b2b7d32a639adb9 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 29 Aug 2019 09:11:00 +0200 Subject: [PATCH 10/32] Use Handlebars template to generate psql script. --- Cargo.lock | 57 +++++++++++++++++++++++ Cargo.toml | 1 + src/tasks/dump-db.toml | 3 +- src/tasks/dump-export.hbs | 21 +++++++++ src/tasks/dump_db.rs | 97 +++++++++++++-------------------------- 5 files changed, 111 insertions(+), 68 deletions(-) create mode 100644 src/tasks/dump-export.hbs diff --git a/Cargo.lock b/Cargo.lock index 5f45ab931c3..bcf03930d65 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,6 +199,7 @@ dependencies = [ "flate2 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)", "git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "hyper 0.12.25 (registry+https://github.com/rust-lang/crates.io-index)", @@ -923,6 +924,31 @@ dependencies = [ "tokio-io 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "handlebars" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "pest 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_derive 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", 
+ "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "hashbrown" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "hex" version = "0.3.2" @@ -1975,6 +2001,14 @@ name = "safemem" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "same-file" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "schannel" version = "0.1.13" @@ -2718,6 +2752,16 @@ name = "void" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "walkdir" +version = "2.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "want" version = "0.0.6" @@ -2752,6 +2796,14 @@ name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "winapi-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -2888,6 +2940,8 @@ dependencies = [ "checksum ghost 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5297b71943dc9fea26a3241b178c140ee215798b7f79f7773fd61683e25bca74" "checksum git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c7339329bfa14a00223244311560d11f8f489b453fb90092af97f267a6090ab0" "checksum h2 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "ddb2b25a33e231484694267af28fec74ac63b5ccf51ee2065a5e313b834d836e" +"checksum handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "df044dd42cdb7e32f28557b661406fc0f2494be75199779998810dbc35030e0d" +"checksum hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e1de41fb8dba9714efd92241565cdff73f78508c95697dd56787d3cba27e2353" "checksum hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "805026a5d0141ffc30abb3be3173848ad46a1b1664fe632428479619a3644d77" "checksum hostname 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "21ceb46a83a85e824ef93669c8b390009623863b5c195d1ba747292c0c72f94e" "checksum html5ever 0.22.5 (registry+https://github.com/rust-lang/crates.io-index)" = "c213fa6a618dc1da552f54f85cba74b05d8e883c92ec4e89067736938084c26e" @@ -3007,6 +3061,7 @@ dependencies = [ "checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" "checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f" "checksum safemem 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8dca453248a96cb0749e36ccdfe2b0b4e54a61bfef89fb97ec621eb8e0a93dd9" +"checksum same-file 1.0.5 
(registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421" "checksum schannel 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "dc1fabf2a7b6483a141426e1afd09ad543520a77ac49bd03c286e7696ccfd77f" "checksum scheduled-thread-pool 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a2ff3fc5223829be817806c6441279c676e454cc7da608faf03b0ccc09d3889" "checksum scoped-tls 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f417c22df063e9450888a7561788e9bd46d3bb3c1466435b4eccb903807f147d" @@ -3091,11 +3146,13 @@ dependencies = [ "checksum vcpkg 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9e0a7d8bed3178a8fb112199d466eeca9ed09a14ba8ad67718179b4fd5487d0b" "checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +"checksum walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9658c94fa8b940eab2250bd5a457f9c48b748420d71293b165c8cdbe2f55f71e" "checksum want 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "797464475f30ddb8830cc529aaaae648d581f99e2036a928877dfde027ddf6b3" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" "checksum wincolor 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767" "checksum winutil 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7daf138b6b14196e3830a588acf1e86966c694d3e8fb026fb105b8b5dca07e6e" diff --git a/Cargo.toml b/Cargo.toml index 47c75e20f6b..8fe086e9b30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,6 +84,7 @@ hyper = "0.12" ctrlc = { version = "3.0", features = ["termination"] } indexmap = "1.0.2" scopeguard = "0.3.3" +handlebars = "2.0.1" [dev-dependencies] conduit-test = "0.8" diff --git a/src/tasks/dump-db.toml b/src/tasks/dump-db.toml index 6465d53bb99..2760151e71b 100644 --- a/src/tasks/dump-db.toml +++ b/src/tasks/dump-db.toml @@ -144,8 +144,7 @@ id in ( SELECT owner_id AS user_id FROM crate_owners WHERE NOT deleted AND owner_kind = 0 UNION SELECT published_by as user_id FROM versions -) -""" +)""" [users.columns] id = "public" email = "private" diff --git a/src/tasks/dump-export.hbs b/src/tasks/dump-export.hbs new file mode 100644 index 00000000000..a4c2252c761 --- /dev/null +++ b/src/tasks/dump-export.hbs @@ -0,0 +1,21 @@ +BEGIN; +{{~#each tables}} +{{~#if this.filter}} + CREATE 
TEMPORARY VIEW "dump_db_{{@key}}" AS ( + SELECT {{this.columns}} + FROM "{{@key}}" + WHERE {{this.filter}} + ); +{{~/if}} +{{~/each}} +COMMIT; + +BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +{{~#each tables}} +{{~#if this.filter}} + \copy (SELECT * FROM "dump_db_{{@key}}") TO '{{@key}}.csv' WITH CSV HEADER +{{~else}} + \copy "{{@key}}" ({{this.columns}}) TO '{{@key}}.csv' WITH CSV HEADER +{{~/if}} +{{~/each}} +COMMIT; diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index c5d2ffa8568..2ac548f8f21 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -60,50 +60,6 @@ impl TableConfig { .collect::>() .join(", ") } - - fn view_sql(&self, table: &str) -> String { - self.filter - .as_ref() - .map(|filter| { - format!( - r#" - CREATE TEMPORARY VIEW "dump_db_{table}" AS ( - SELECT {columns} - FROM "{table}" - WHERE {filter} - ); - "#, - table = table, - columns = self.columns_str(), - filter = filter, - ) - }) - .unwrap_or_default() - } - - fn copy_sql(&self, table: &str) -> String { - if self.filter.is_some() { - format!( - r#" - \copy (SELECT * FROM "dump_db_{table}") TO '{table}.csv' WITH CSV HEADER - "#, - table = table, - ) - } else { - let cols_str = self.columns_str(); - if cols_str.is_empty() { - String::new() - } else { - format!( - r#" - \copy "{table}" ({columns}) TO '{table}.csv' WITH CSV HEADER - "#, - table = table, - columns = cols_str, - ) - } - } - } } /// Maps table names to the respective configurations. Used to load `dump_db.toml`. @@ -113,30 +69,39 @@ struct VisibilityConfig(BTreeMap); impl VisibilityConfig { fn gen_psql_script(&self) -> String { - let view_sql = self - .0 - .iter() - .map(|(table, config)| config.view_sql(table)) - .collect::>() - .concat(); - let copy_sql = self + #[derive(Serialize)] + struct TableContext<'a> { + filter: Option<&'a str>, + columns: String, + } + #[derive(Serialize)] + struct Context<'a> { + tables: BTreeMap<&'a str, TableContext<'a>>, + } + let tables = self .0 .iter() - .map(|(table, config)| config.copy_sql(table)) - .collect::>() - .concat(); - format!( - r#" - BEGIN; - {view_sql} - COMMIT; - BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; - {copy_sql} - COMMIT; - "#, - view_sql = view_sql, - copy_sql = copy_sql, - ) + .filter_map(|(table, config)| { + let columns = config.columns_str(); + if columns.is_empty() { + None + } else { + Some(( + table.as_str(), + TableContext { + filter: config.filter.as_ref().map(String::as_str), + columns, + }, + )) + } + }) + .collect(); + let context = Context { tables }; + let mut handlebars = handlebars::Handlebars::new(); + handlebars.register_escape_fn(handlebars::no_escape); + handlebars + .render_template(include_str!("dump-export.hbs"), &context) + .unwrap() } } From ee888f87df165da67d20b2ad36efa67573a6d1ca Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 29 Aug 2019 11:48:28 +0200 Subject: [PATCH 11/32] Add code to generate import script for dumps. --- src/tasks/dump-db.toml | 18 ++++++ src/tasks/dump-export.hbs | 8 +-- src/tasks/dump-import.hbs | 8 +++ src/tasks/dump_db.rs | 129 +++++++++++++++++++++++++++----------- 4 files changed, 123 insertions(+), 40 deletions(-) create mode 100644 src/tasks/dump-import.hbs diff --git a/src/tasks/dump-db.toml b/src/tasks/dump-db.toml index 2760151e71b..00d34976042 100644 --- a/src/tasks/dump-db.toml +++ b/src/tasks/dump-db.toml @@ -7,6 +7,11 @@ # # .filter - a string that is a valid SQL expression, which is used # in a WHERE clause to filter the rows of the table. 
+# +# .dependencies - an array of table names, used to determine the +# order of the tables in the generated import script. All tables referred +# to by public columns in the current table should be listed, to make sure +# they are imported before this table. [api_tokens.columns] id = "private" @@ -46,6 +51,7 @@ crate_id = "private" created_at = "private" [crate_owners] +dependencies = ["crates", "users"] filter = "NOT deleted" [crate_owners.columns] crate_id = "public" @@ -71,14 +77,20 @@ license = "public" repository = "public" max_upload_size = "public" +[crates_categories] +dependencies = ["categories", "crates"] [crates_categories.columns] crate_id = "public" category_id = "public" +[crates_keywords] +dependencies = ["crates", "keywords"] [crates_keywords.columns] crate_id = "public" keyword_id = "public" +[dependencies] +dependencies = ["crates", "versions"] [dependencies.columns] id = "public" version_id = "public" @@ -154,12 +166,16 @@ name = "public" gh_avatar = "public" gh_id = "public" +[version_authors] +dependencies = ["users", "versions"] [version_authors.columns] id = "public" version_id = "public" user_id = "private" name = "public" +[version_downloads] +dependencies = ["versions"] [version_downloads.columns] version_id = "public" downloads = "public" @@ -167,6 +183,8 @@ counted = "private" date = "public" processed = "private" +[versions] +dependencies = ["crates", "users"] [versions.columns] id = "public" crate_id = "public" diff --git a/src/tasks/dump-export.hbs b/src/tasks/dump-export.hbs index a4c2252c761..fe3e1a38a04 100644 --- a/src/tasks/dump-export.hbs +++ b/src/tasks/dump-export.hbs @@ -1,9 +1,9 @@ BEGIN; {{~#each tables}} {{~#if this.filter}} - CREATE TEMPORARY VIEW "dump_db_{{@key}}" AS ( + CREATE TEMPORARY VIEW "dump_db_{{this.name}}" AS ( SELECT {{this.columns}} - FROM "{{@key}}" + FROM "{{this.name}}" WHERE {{this.filter}} ); {{~/if}} @@ -13,9 +13,9 @@ COMMIT; BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; {{~#each tables}} {{~#if this.filter}} - \copy (SELECT * FROM "dump_db_{{@key}}") TO '{{@key}}.csv' WITH CSV HEADER + \copy (SELECT * FROM "dump_db_{{this.name}}") TO '{{this.name}}.csv' WITH CSV HEADER {{~else}} - \copy "{{@key}}" ({{this.columns}}) TO '{{@key}}.csv' WITH CSV HEADER + \copy "{{this.name}}" ({{this.columns}}) TO '{{this.name}}.csv' WITH CSV HEADER {{~/if}} {{~/each}} COMMIT; diff --git a/src/tasks/dump-import.hbs b/src/tasks/dump-import.hbs new file mode 100644 index 00000000000..f244bc7aa3b --- /dev/null +++ b/src/tasks/dump-import.hbs @@ -0,0 +1,8 @@ +BEGIN; +{{~#each tables}} + TRUNCATE "{{this.name}}" RESTART IDENTITY CASCADE; +{{~/each}} +{{~#each tables}} + \copy "{{this.name}}" ({{this.columns}}) FROM '{{this.name}}.csv' WITH CSV HEADER +{{~/each}} +COMMIT; diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 2ac548f8f21..94727d20ec4 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -1,5 +1,5 @@ use std::{ - collections::BTreeMap, + collections::{BTreeMap, VecDeque}, path::{Path, PathBuf}, }; @@ -42,23 +42,46 @@ enum ColumnVisibility { Public, } -/// Filtering information for a single table. The `filter` field is a valid SQL -/// expression used in a `WHERE` clause to filter the rows of the table. The -/// `columns` field maps column names to their respective visibilities. +/// Filtering information for a single table. 
The `dependencies` field is only +/// used to determine the order of the tables in the generated import script, +/// and should list all tables the current tables refers to with foreign key +/// constraints on public columns. The `filter` field is a valid SQL expression +/// used in a `WHERE` clause to filter the rows of the table. The `columns` +/// field maps column names to their respective visibilities. #[derive(Clone, Debug, Deserialize)] struct TableConfig { + #[serde(default)] + dependencies: Vec, filter: Option, columns: BTreeMap, } +/// Subset of the configuration data to be passed on to the Handlbars template. +#[derive(Debug, Serialize)] +struct HandlebarsTableContext<'a> { + name: &'a str, + filter: Option<&'a str>, + columns: String, +} + impl TableConfig { - fn columns_str(&self) -> String { - self.columns + fn handlebars_context<'a>(&'a self, name: &'a str) -> Option> { + let columns = self + .columns .iter() .filter(|&(_, &vis)| vis == ColumnVisibility::Public) .map(|(col, _)| format!("\"{}\"", col)) .collect::>() - .join(", ") + .join(", "); + if columns.is_empty() { + None + } else { + Some(HandlebarsTableContext { + name, + filter: self.filter.as_ref().map(String::as_str), + columns, + }) + } } } @@ -67,41 +90,75 @@ impl TableConfig { #[serde(transparent)] struct VisibilityConfig(BTreeMap); +/// Subset of the configuration data to be passed on to the Handlbars template. +#[derive(Debug, Serialize)] +struct HandlebarsContext<'a> { + tables: Vec>, +} + impl VisibilityConfig { - fn gen_psql_script(&self) -> String { - #[derive(Serialize)] - struct TableContext<'a> { - filter: Option<&'a str>, - columns: String, - } - #[derive(Serialize)] - struct Context<'a> { - tables: BTreeMap<&'a str, TableContext<'a>>, - } - let tables = self + /// Sort the tables in a way that dependencies come before dependent tables. + /// + /// Returns a vector of table names. 
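+    ///
+    /// This is essentially Kahn's algorithm: every table starts with a count
+    /// of its configured dependencies, tables whose count is zero are emitted
+    /// first, and emitting a table decrements the count of each table that
+    /// depends on it. If some counts never reach zero, the configuration
+    /// contains a dependency cycle and the assertion at the end fails.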
+ fn topological_sort(&self) -> Vec<&str> { + let mut result = Vec::new(); + let mut num_deps = BTreeMap::new(); + let mut rev_deps: BTreeMap<&str, Vec<&str>> = self .0 .iter() - .filter_map(|(table, config)| { - let columns = config.columns_str(); - if columns.is_empty() { - None - } else { - Some(( - table.as_str(), - TableContext { - filter: config.filter.as_ref().map(String::as_str), - columns, - }, - )) + .map(|(table, _)| (table.as_str(), vec![])) + .collect(); + for (table, config) in self.0.iter() { + num_deps.insert(table.as_str(), config.dependencies.len()); + for dep in &config.dependencies { + rev_deps.get_mut(dep.as_str()).unwrap().push(table.as_str()); + } + } + let mut ready: VecDeque<&str> = num_deps + .iter() + .filter(|(_, &count)| count == 0) + .map(|(&table, _)| table) + .collect(); + while let Some(table) = ready.pop_front() { + result.push(table); + for dep in &rev_deps[table] { + *num_deps.get_mut(dep).unwrap() -= 1; + if num_deps[dep] == 0 { + ready.push_back(dep); } - }) + } + } + assert_eq!( + self.0.len(), + result.len(), + "circular dependencies in DB dump config detected", + ); + result + } + + fn handlebars_context(&self) -> HandlebarsContext<'_> { + let tables = self + .topological_sort() + .into_iter() + .filter_map(|table| self.0[table].handlebars_context(table)) .collect(); - let context = Context { tables }; + HandlebarsContext { tables } + } + + fn render_template(&self, template: &str) -> String { + let context = self.handlebars_context(); let mut handlebars = handlebars::Handlebars::new(); handlebars.register_escape_fn(handlebars::no_escape); - handlebars - .render_template(include_str!("dump-export.hbs"), &context) - .unwrap() + handlebars.render_template(template, &context).unwrap() + } + + fn gen_export_script(&self) -> String { + self.render_template(include_str!("dump-export.hbs")) + } + + #[allow(dead_code)] + fn gen_import_script(&self) -> String { + self.render_template(include_str!("dump-import.hbs")) } } @@ -113,7 +170,7 @@ fn run_psql( use std::io::prelude::*; use std::process::{Command, Stdio}; - let psql_script = config.gen_psql_script(); + let psql_script = config.gen_export_script(); let mut psql = Command::new("psql") .arg(database_url) .current_dir(export_dir) From bcbccd6781bc1a631afef12080a03a56c0c70f27 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 29 Aug 2019 21:46:54 +0200 Subject: [PATCH 12/32] Include import and export scripts in the database dump. --- src/tasks/dump-export.hbs | 3 ++ src/tasks/dump-import.hbs | 16 ++++++++++ src/tasks/dump_db.rs | 62 +++++++++++++++++++-------------------- 3 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/tasks/dump-export.hbs b/src/tasks/dump-export.hbs index fe3e1a38a04..fa8582ad825 100644 --- a/src/tasks/dump-export.hbs +++ b/src/tasks/dump-export.hbs @@ -1,3 +1,6 @@ +-- This script was used to create this database dump, and is only included in +-- the archive for reference. + BEGIN; {{~#each tables}} {{~#if this.filter}} diff --git a/src/tasks/dump-import.hbs b/src/tasks/dump-import.hbs index f244bc7aa3b..1700810e1c2 100644 --- a/src/tasks/dump-import.hbs +++ b/src/tasks/dump-import.hbs @@ -1,3 +1,19 @@ +-- Script for psql to restore the dump into a local crates.io database. +-- +-- WARNING: This will destroy the current database contents. +-- +-- Instructions: +-- +-- 1. Create a new database and run the Diesel migrations. +-- +-- createdb DATABASE_NAME +-- diesel migration run --database-url DATABASE_URL +-- +-- 2. Run this script. 
+-- +-- cd DUMP_DIRECTORY +-- psql DATABASE_URL < import.sql + BEGIN; {{~#each tables}} TRUNCATE "{{this.name}}" RESTART IDENTITY CASCADE; diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 94727d20ec4..03c44ea1bdb 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -1,5 +1,6 @@ use std::{ collections::{BTreeMap, VecDeque}, + fs::File, path::{Path, PathBuf}, }; @@ -22,8 +23,11 @@ pub fn dump_db( defer! {{ std::fs::remove_dir_all(&export_dir).unwrap(); }} - let visibility_config = toml::from_str(include_str!("dump-db.toml")).unwrap(); - run_psql(&visibility_config, &database_url, &export_dir)?; + let visibility_config: VisibilityConfig = toml::from_str(include_str!("dump-db.toml")).unwrap(); + let export_script = export_dir.join("export.sql"); + let import_script = export_dir.join("import.sql"); + visibility_config.gen_psql_scripts(&export_script, &import_script)?; + run_psql(&database_url, &export_script)?; let tarball = create_tarball(&export_dir)?; defer! {{ std::fs::remove_file(&tarball).unwrap(); @@ -145,46 +149,42 @@ impl VisibilityConfig { HandlebarsContext { tables } } - fn render_template(&self, template: &str) -> String { + fn gen_psql_scripts( + &self, + export_script: &Path, + import_script: &Path, + ) -> Result<(), PerformError> { let context = self.handlebars_context(); let mut handlebars = handlebars::Handlebars::new(); handlebars.register_escape_fn(handlebars::no_escape); - handlebars.render_template(template, &context).unwrap() - } - - fn gen_export_script(&self) -> String { - self.render_template(include_str!("dump-export.hbs")) - } - - #[allow(dead_code)] - fn gen_import_script(&self) -> String { - self.render_template(include_str!("dump-import.hbs")) + let export_sql = File::create(export_script)?; + handlebars.render_template_to_write( + include_str!("dump-export.hbs"), + &context, + export_sql, + )?; + let import_sql = File::create(import_script)?; + handlebars.render_template_to_write( + include_str!("dump-import.hbs"), + &context, + import_sql, + )?; + Ok(()) } } -fn run_psql( - config: &VisibilityConfig, - database_url: &str, - export_dir: &Path, -) -> Result<(), PerformError> { - use std::io::prelude::*; +fn run_psql(database_url: &str, export_script: &Path) -> Result<(), PerformError> { use std::process::{Command, Stdio}; - let psql_script = config.gen_export_script(); - let mut psql = Command::new("psql") + let psql_script = File::open(export_script)?; + let psql = Command::new("psql") .arg(database_url) - .current_dir(export_dir) - .stdin(Stdio::piped()) + .current_dir(export_script.parent().unwrap()) + .stdin(psql_script) .stdout(Stdio::null()) .stderr(Stdio::piped()) .spawn()?; - let mut stdin = psql.stdin.take().unwrap(); - let input_thread = std::thread::spawn(move || -> std::io::Result<()> { - stdin.write_all(psql_script.as_bytes())?; - Ok(()) - }); let output = psql.wait_with_output()?; - input_thread.join().unwrap()?; if !output.stderr.is_empty() { Err(format!( "Error while executing psql: {}", @@ -199,7 +199,7 @@ fn run_psql( fn create_tarball(export_dir: &Path) -> Result { let tarball_name = export_dir.with_extension("tar.gz"); - let tarball = std::fs::File::create(&tarball_name)?; + let tarball = File::create(&tarball_name)?; let encoder = flate2::write::GzEncoder::new(tarball, flate2::Compression::default()); let mut archive = tar::Builder::new(encoder); archive.append_dir_all(export_dir.file_name().unwrap(), &export_dir)?; @@ -212,7 +212,7 @@ fn upload_tarball( uploader: &Uploader, ) -> Result<(), PerformError> { let client = 
reqwest::Client::new(); - let tarfile = std::fs::File::open(tarball)?; + let tarfile = File::open(tarball)?; let content_length = tarfile.metadata()?.len(); // TODO Figure out the correct content type. uploader From 3eff6dc6c60d73be4609140103a098e969dc6cfb Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Mon, 2 Sep 2019 20:09:30 +0200 Subject: [PATCH 13/32] Set default value for gh_access_token in import script. --- src/tasks/dump-import.hbs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tasks/dump-import.hbs b/src/tasks/dump-import.hbs index 1700810e1c2..73708508309 100644 --- a/src/tasks/dump-import.hbs +++ b/src/tasks/dump-import.hbs @@ -15,10 +15,12 @@ -- psql DATABASE_URL < import.sql BEGIN; -{{~#each tables}} + -- Set defaults for non-nullable columns not included in the dump. + ALTER TABLE users ALTER COLUMN gh_access_token SET DEFAULT ''; +{{#each tables}} TRUNCATE "{{this.name}}" RESTART IDENTITY CASCADE; {{~/each}} -{{~#each tables}} +{{#each tables}} \copy "{{this.name}}" ({{this.columns}}) FROM '{{this.name}}.csv' WITH CSV HEADER {{~/each}} COMMIT; From b085a783779b59d4bb59c43053098760eb10fecf Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 3 Sep 2019 09:29:14 +0200 Subject: [PATCH 14/32] Simplify topological sort implementation. --- src/tasks/dump_db.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 03c44ea1bdb..12f23672aab 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -107,15 +107,11 @@ impl VisibilityConfig { fn topological_sort(&self) -> Vec<&str> { let mut result = Vec::new(); let mut num_deps = BTreeMap::new(); - let mut rev_deps: BTreeMap<&str, Vec<&str>> = self - .0 - .iter() - .map(|(table, _)| (table.as_str(), vec![])) - .collect(); + let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new(); for (table, config) in self.0.iter() { num_deps.insert(table.as_str(), config.dependencies.len()); for dep in &config.dependencies { - rev_deps.get_mut(dep.as_str()).unwrap().push(table.as_str()); + rev_deps.entry(dep.as_str()).or_default().push(table.as_str()); } } let mut ready: VecDeque<&str> = num_deps @@ -125,7 +121,7 @@ impl VisibilityConfig { .collect(); while let Some(table) = ready.pop_front() { result.push(table); - for dep in &rev_deps[table] { + for dep in rev_deps.get(table).iter().copied().flatten() { *num_deps.get_mut(dep).unwrap() -= 1; if num_deps[dep] == 0 { ready.push_back(dep); @@ -135,7 +131,7 @@ impl VisibilityConfig { assert_eq!( self.0.len(), result.len(), - "circular dependencies in DB dump config detected", + "circular dependencies in database dump configuration detected", ); result } From b4ad1f1614b44914cafa2406cd8355b8c296bd98 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 3 Sep 2019 14:09:49 +0200 Subject: [PATCH 15/32] Move psql script generation code to submodule. 
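The visibility configuration types and the script-generation code move out of
dump_db.rs into a new gen_scripts submodule, together with dump-db.toml and the
two Handlebars templates; dump_db.rs keeps the job itself (psql invocation,
tarball creation, upload and cleanup) and gains a `mod gen_scripts;`
declaration. The resulting layout is roughly:

    src/tasks/dump_db.rs
    src/tasks/dump_db/gen_scripts.rs
    src/tasks/dump_db/dump-db.toml
    src/tasks/dump_db/dump-export.hbs
    src/tasks/dump_db/dump-import.hbs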
--- src/tasks/dump_db.rs | 211 +--------------------- src/tasks/{ => dump_db}/dump-db.toml | 0 src/tasks/{ => dump_db}/dump-export.hbs | 0 src/tasks/{ => dump_db}/dump-import.hbs | 0 src/tasks/dump_db/gen_scripts.rs | 221 ++++++++++++++++++++++++ 5 files changed, 223 insertions(+), 209 deletions(-) rename src/tasks/{ => dump_db}/dump-db.toml (100%) rename src/tasks/{ => dump_db}/dump-export.hbs (100%) rename src/tasks/{ => dump_db}/dump-import.hbs (100%) create mode 100644 src/tasks/dump_db/gen_scripts.rs diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 12f23672aab..13a3f2a97c5 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -1,5 +1,4 @@ use std::{ - collections::{BTreeMap, VecDeque}, fs::File, path::{Path, PathBuf}, }; @@ -23,10 +22,9 @@ pub fn dump_db( defer! {{ std::fs::remove_dir_all(&export_dir).unwrap(); }} - let visibility_config: VisibilityConfig = toml::from_str(include_str!("dump-db.toml")).unwrap(); let export_script = export_dir.join("export.sql"); let import_script = export_dir.join("import.sql"); - visibility_config.gen_psql_scripts(&export_script, &import_script)?; + gen_scripts::gen_scripts(&export_script, &import_script)?; run_psql(&database_url, &export_script)?; let tarball = create_tarball(&export_dir)?; defer! {{ @@ -37,138 +35,6 @@ pub fn dump_db( Ok(()) } -/// An enum indicating whether a column is included in the database dumps. -/// Public columns are included, private are not. -#[derive(Clone, Copy, Debug, Deserialize, PartialEq)] -#[serde(rename_all = "lowercase")] -enum ColumnVisibility { - Private, - Public, -} - -/// Filtering information for a single table. The `dependencies` field is only -/// used to determine the order of the tables in the generated import script, -/// and should list all tables the current tables refers to with foreign key -/// constraints on public columns. The `filter` field is a valid SQL expression -/// used in a `WHERE` clause to filter the rows of the table. The `columns` -/// field maps column names to their respective visibilities. -#[derive(Clone, Debug, Deserialize)] -struct TableConfig { - #[serde(default)] - dependencies: Vec, - filter: Option, - columns: BTreeMap, -} - -/// Subset of the configuration data to be passed on to the Handlbars template. -#[derive(Debug, Serialize)] -struct HandlebarsTableContext<'a> { - name: &'a str, - filter: Option<&'a str>, - columns: String, -} - -impl TableConfig { - fn handlebars_context<'a>(&'a self, name: &'a str) -> Option> { - let columns = self - .columns - .iter() - .filter(|&(_, &vis)| vis == ColumnVisibility::Public) - .map(|(col, _)| format!("\"{}\"", col)) - .collect::>() - .join(", "); - if columns.is_empty() { - None - } else { - Some(HandlebarsTableContext { - name, - filter: self.filter.as_ref().map(String::as_str), - columns, - }) - } - } -} - -/// Maps table names to the respective configurations. Used to load `dump_db.toml`. -#[derive(Clone, Debug, Deserialize)] -#[serde(transparent)] -struct VisibilityConfig(BTreeMap); - -/// Subset of the configuration data to be passed on to the Handlbars template. -#[derive(Debug, Serialize)] -struct HandlebarsContext<'a> { - tables: Vec>, -} - -impl VisibilityConfig { - /// Sort the tables in a way that dependencies come before dependent tables. - /// - /// Returns a vector of table names. 
- fn topological_sort(&self) -> Vec<&str> { - let mut result = Vec::new(); - let mut num_deps = BTreeMap::new(); - let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new(); - for (table, config) in self.0.iter() { - num_deps.insert(table.as_str(), config.dependencies.len()); - for dep in &config.dependencies { - rev_deps.entry(dep.as_str()).or_default().push(table.as_str()); - } - } - let mut ready: VecDeque<&str> = num_deps - .iter() - .filter(|(_, &count)| count == 0) - .map(|(&table, _)| table) - .collect(); - while let Some(table) = ready.pop_front() { - result.push(table); - for dep in rev_deps.get(table).iter().copied().flatten() { - *num_deps.get_mut(dep).unwrap() -= 1; - if num_deps[dep] == 0 { - ready.push_back(dep); - } - } - } - assert_eq!( - self.0.len(), - result.len(), - "circular dependencies in database dump configuration detected", - ); - result - } - - fn handlebars_context(&self) -> HandlebarsContext<'_> { - let tables = self - .topological_sort() - .into_iter() - .filter_map(|table| self.0[table].handlebars_context(table)) - .collect(); - HandlebarsContext { tables } - } - - fn gen_psql_scripts( - &self, - export_script: &Path, - import_script: &Path, - ) -> Result<(), PerformError> { - let context = self.handlebars_context(); - let mut handlebars = handlebars::Handlebars::new(); - handlebars.register_escape_fn(handlebars::no_escape); - let export_sql = File::create(export_script)?; - handlebars.render_template_to_write( - include_str!("dump-export.hbs"), - &context, - export_sql, - )?; - let import_sql = File::create(import_script)?; - handlebars.render_template_to_write( - include_str!("dump-import.hbs"), - &context, - import_sql, - )?; - Ok(()) - } -} - fn run_psql(database_url: &str, export_script: &Path) -> Result<(), PerformError> { use std::process::{Command, Stdio}; @@ -223,77 +89,4 @@ fn upload_tarball( Ok(()) } -#[cfg(test)] -mod tests { - use super::*; - use crate::test_util::pg_connection; - use diesel::prelude::*; - use std::collections::HashSet; - - /// Test whether the schema in the visibility configuration matches the test database. - #[test] - fn check_visibility_config() { - let conn = pg_connection(); - let db_columns: HashSet<_> = get_db_columns(&conn) - .into_iter() - .map(|c| (c.table_name, c.column_name)) - .collect(); - let visibility_config: VisibilityConfig = - toml::from_str(include_str!("dump-db.toml")).unwrap(); - let vis_columns: HashSet<_> = visibility_config - .0 - .iter() - .flat_map(|(table, config)| { - config - .columns - .iter() - .map(move |(column, _)| (table.clone(), column.clone())) - }) - .collect(); - let mut errors = vec![]; - for (table, col) in db_columns.difference(&vis_columns) { - errors.push(format!( - "No visibility information for columns {}.{}.", - table, col - )); - } - for (table, col) in vis_columns.difference(&db_columns) { - errors.push(format!( - "Column {}.{} does not exist in the database.", - table, col - )); - } - assert!( - errors.is_empty(), - "The visibility configuration does not match the database schema:\n{}", - errors.join("\n - "), - ); - } - - mod information_schema { - table! 
{ - information_schema.columns (table_schema, table_name, column_name) { - table_schema -> Text, - table_name -> Text, - column_name -> Text, - ordinal_position -> Integer, - } - } - } - - #[derive(Debug, PartialEq, Queryable)] - struct Column { - table_name: String, - column_name: String, - } - - fn get_db_columns(conn: &PgConnection) -> Vec { - use information_schema::columns::dsl::*; - columns - .select((table_name, column_name)) - .filter(table_schema.eq("public")) - .order_by((table_name, ordinal_position)) - .load(conn) - .unwrap() - } -} +mod gen_scripts; diff --git a/src/tasks/dump-db.toml b/src/tasks/dump_db/dump-db.toml similarity index 100% rename from src/tasks/dump-db.toml rename to src/tasks/dump_db/dump-db.toml diff --git a/src/tasks/dump-export.hbs b/src/tasks/dump_db/dump-export.hbs similarity index 100% rename from src/tasks/dump-export.hbs rename to src/tasks/dump_db/dump-export.hbs diff --git a/src/tasks/dump-import.hbs b/src/tasks/dump_db/dump-import.hbs similarity index 100% rename from src/tasks/dump-import.hbs rename to src/tasks/dump_db/dump-import.hbs diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs new file mode 100644 index 00000000000..aa2267b2c4c --- /dev/null +++ b/src/tasks/dump_db/gen_scripts.rs @@ -0,0 +1,221 @@ +use std::{ + collections::{BTreeMap, VecDeque}, + fs::File, + path::Path, +}; + +use swirl::PerformError; + +pub fn gen_scripts(export_script: &Path, import_script: &Path) -> Result<(), PerformError> { + let config: VisibilityConfig = toml::from_str(include_str!("dump-db.toml")).unwrap(); + let export_sql = File::create(export_script)?; + let import_sql = File::create(import_script)?; + config.gen_psql_scripts(export_sql, import_sql) +} + +/// An enum indicating whether a column is included in the database dumps. +/// Public columns are included, private are not. +#[derive(Clone, Copy, Debug, Deserialize, PartialEq)] +#[serde(rename_all = "lowercase")] +enum ColumnVisibility { + Private, + Public, +} + +/// Filtering information for a single table. The `dependencies` field is only +/// used to determine the order of the tables in the generated import script, +/// and should list all tables the current tables refers to with foreign key +/// constraints on public columns. The `filter` field is a valid SQL expression +/// used in a `WHERE` clause to filter the rows of the table. The `columns` +/// field maps column names to their respective visibilities. +#[derive(Clone, Debug, Deserialize)] +struct TableConfig { + #[serde(default)] + dependencies: Vec, + filter: Option, + columns: BTreeMap, +} + +/// Subset of the configuration data to be passed on to the Handlbars template. +#[derive(Debug, Serialize)] +struct HandlebarsTableContext<'a> { + name: &'a str, + filter: Option<&'a str>, + columns: String, +} + +impl TableConfig { + fn handlebars_context<'a>(&'a self, name: &'a str) -> Option> { + let columns = self + .columns + .iter() + .filter(|&(_, &vis)| vis == ColumnVisibility::Public) + .map(|(col, _)| format!("\"{}\"", col)) + .collect::>() + .join(", "); + if columns.is_empty() { + None + } else { + Some(HandlebarsTableContext { + name, + filter: self.filter.as_ref().map(String::as_str), + columns, + }) + } + } +} + +/// Maps table names to the respective configurations. Used to load `dump_db.toml`. +#[derive(Clone, Debug, Deserialize)] +#[serde(transparent)] +struct VisibilityConfig(BTreeMap); + +/// Subset of the configuration data to be passed on to the Handlbars template. 
+#[derive(Debug, Serialize)] +struct HandlebarsContext<'a> { + tables: Vec>, +} + +impl VisibilityConfig { + /// Sort the tables in a way that dependencies come before dependent tables. + /// + /// Returns a vector of table names. + fn topological_sort(&self) -> Vec<&str> { + let mut result = Vec::new(); + let mut num_deps = BTreeMap::new(); + let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new(); + for (table, config) in self.0.iter() { + num_deps.insert(table.as_str(), config.dependencies.len()); + for dep in &config.dependencies { + rev_deps + .entry(dep.as_str()) + .or_default() + .push(table.as_str()); + } + } + let mut ready: VecDeque<&str> = num_deps + .iter() + .filter(|(_, &count)| count == 0) + .map(|(&table, _)| table) + .collect(); + while let Some(table) = ready.pop_front() { + result.push(table); + for dep in rev_deps.get(table).iter().copied().flatten() { + *num_deps.get_mut(dep).unwrap() -= 1; + if num_deps[dep] == 0 { + ready.push_back(dep); + } + } + } + assert_eq!( + self.0.len(), + result.len(), + "circular dependencies in database dump configuration detected", + ); + result + } + + fn handlebars_context(&self) -> HandlebarsContext<'_> { + let tables = self + .topological_sort() + .into_iter() + .filter_map(|table| self.0[table].handlebars_context(table)) + .collect(); + HandlebarsContext { tables } + } + + fn gen_psql_scripts(&self, export_sql: W, import_sql: W) -> Result<(), PerformError> + where + W: std::io::Write, + { + let context = self.handlebars_context(); + let mut handlebars = handlebars::Handlebars::new(); + handlebars.register_escape_fn(handlebars::no_escape); + handlebars.render_template_to_write( + include_str!("dump-export.hbs"), + &context, + export_sql, + )?; + handlebars.render_template_to_write( + include_str!("dump-import.hbs"), + &context, + import_sql, + )?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::pg_connection; + use diesel::prelude::*; + use std::collections::HashSet; + + /// Test whether the schema in the visibility configuration matches the test database. + #[test] + fn check_visibility_config() { + let conn = pg_connection(); + let db_columns: HashSet<_> = get_db_columns(&conn) + .into_iter() + .map(|c| (c.table_name, c.column_name)) + .collect(); + let visibility_config: VisibilityConfig = + toml::from_str(include_str!("dump-db.toml")).unwrap(); + let vis_columns: HashSet<_> = visibility_config + .0 + .iter() + .flat_map(|(table, config)| { + config + .columns + .iter() + .map(move |(column, _)| (table.clone(), column.clone())) + }) + .collect(); + let mut errors = vec![]; + for (table, col) in db_columns.difference(&vis_columns) { + errors.push(format!( + "No visibility information for columns {}.{}.", + table, col + )); + } + for (table, col) in vis_columns.difference(&db_columns) { + errors.push(format!( + "Column {}.{} does not exist in the database.", + table, col + )); + } + assert!( + errors.is_empty(), + "The visibility configuration does not match the database schema:\n{}", + errors.join("\n - "), + ); + } + + mod information_schema { + table! 
{ + information_schema.columns (table_schema, table_name, column_name) { + table_schema -> Text, + table_name -> Text, + column_name -> Text, + ordinal_position -> Integer, + } + } + } + + #[derive(Debug, PartialEq, Queryable)] + struct Column { + table_name: String, + column_name: String, + } + + fn get_db_columns(conn: &PgConnection) -> Vec { + use information_schema::columns::dsl::*; + columns + .select((table_name, column_name)) + .filter(table_schema.eq("public")) + .order_by((table_name, ordinal_position)) + .load(conn) + .unwrap() + } +} From 3b477c9ec7adbdb097477f3bbb30bfa72be9f81c Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 3 Sep 2019 15:01:46 +0200 Subject: [PATCH 16/32] Refactor dump_db code to use Drop for cleanup. --- Cargo.lock | 1 - Cargo.toml | 1 - src/tasks/dump_db.rs | 146 +++++++++++++++++++++++++------------------ 3 files changed, 85 insertions(+), 63 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bcf03930d65..c180eeb8308 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -218,7 +218,6 @@ dependencies = [ "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "reqwest 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)", "scheduled-thread-pool 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "semver 0.9.0 (git+https://github.com/steveklabnik/semver.git)", "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index 8fe086e9b30..8c9eaebb343 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,7 +83,6 @@ tokio = "0.1" hyper = "0.12" ctrlc = { version = "3.0", features = ["termination"] } indexmap = "1.0.2" -scopeguard = "0.3.3" handlebars = "2.0.1" [dev-dependencies] diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 13a3f2a97c5..2b33ef3c155 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -5,7 +5,6 @@ use std::{ use crate::{background_jobs::Environment, uploaders::Uploader, util::errors::std_error_no_send}; -use scopeguard::defer; use swirl::PerformError; /// Create CSV dumps of the public information in the database, wrap them in a @@ -16,77 +15,102 @@ pub fn dump_db( database_url: String, target_name: String, ) -> Result<(), PerformError> { - let timestamp = chrono::Utc::now().format("%Y-%m-%d-%H%M%S").to_string(); - let export_dir = std::env::temp_dir().join("dump-db").join(timestamp); - std::fs::create_dir_all(&export_dir)?; - defer! {{ - std::fs::remove_dir_all(&export_dir).unwrap(); - }} - let export_script = export_dir.join("export.sql"); - let import_script = export_dir.join("import.sql"); - gen_scripts::gen_scripts(&export_script, &import_script)?; - run_psql(&database_url, &export_script)?; - let tarball = create_tarball(&export_dir)?; - defer! {{ - std::fs::remove_file(&tarball).unwrap(); - }} - upload_tarball(&tarball, &target_name, &env.uploader)?; + let directory = DumpDirectory::create()?; + directory.dump_db(&database_url)?; + let tarball = DumpTarball::create(&directory.export_dir)?; + tarball.upload(&target_name, &env.uploader)?; println!("Database dump uploaded to {}.", &target_name); Ok(()) } -fn run_psql(database_url: &str, export_script: &Path) -> Result<(), PerformError> { - use std::process::{Command, Stdio}; +/// Manage the export directory. 
+/// +/// Create the directory, populate it with the psql scripts and CSV dumps, and +/// make sure it gets deleted again even in the case of an error. +struct DumpDirectory { + export_dir: PathBuf, +} - let psql_script = File::open(export_script)?; - let psql = Command::new("psql") - .arg(database_url) - .current_dir(export_script.parent().unwrap()) - .stdin(psql_script) - .stdout(Stdio::null()) - .stderr(Stdio::piped()) - .spawn()?; - let output = psql.wait_with_output()?; - if !output.stderr.is_empty() { - Err(format!( - "Error while executing psql: {}", - String::from_utf8_lossy(&output.stderr) - ))?; +impl DumpDirectory { + fn create() -> Result { + let timestamp = chrono::Utc::now().format("%Y-%m-%d-%H%M%S").to_string(); + let export_dir = std::env::temp_dir().join("dump-db").join(timestamp); + std::fs::create_dir_all(&export_dir)?; + Ok(Self { export_dir }) } - if !output.status.success() { - Err("psql did not finish successfully.")?; + + fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { + let export_script = self.export_dir.join("export.sql"); + let import_script = self.export_dir.join("import.sql"); + gen_scripts::gen_scripts(&export_script, &import_script)?; + let psql_script = File::open(&export_script)?; + let psql = std::process::Command::new("psql") + .arg(database_url) + .current_dir(export_script.parent().unwrap()) + .stdin(psql_script) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + let output = psql.wait_with_output()?; + if !output.stderr.is_empty() { + Err(format!( + "Error while executing psql: {}", + String::from_utf8_lossy(&output.stderr) + ))?; + } + if !output.status.success() { + Err("psql did not finish successfully.")?; + } + Ok(()) } - Ok(()) } -fn create_tarball(export_dir: &Path) -> Result { - let tarball_name = export_dir.with_extension("tar.gz"); - let tarball = File::create(&tarball_name)?; - let encoder = flate2::write::GzEncoder::new(tarball, flate2::Compression::default()); - let mut archive = tar::Builder::new(encoder); - archive.append_dir_all(export_dir.file_name().unwrap(), &export_dir)?; - Ok(tarball_name) +impl Drop for DumpDirectory { + fn drop(&mut self) { + std::fs::remove_dir_all(&self.export_dir).unwrap(); + } } -fn upload_tarball( - tarball: &Path, - target_name: &str, - uploader: &Uploader, -) -> Result<(), PerformError> { - let client = reqwest::Client::new(); - let tarfile = File::open(tarball)?; - let content_length = tarfile.metadata()?.len(); - // TODO Figure out the correct content type. - uploader - .upload( - &client, - target_name, - tarfile, - content_length, - "application/gzip", - ) - .map_err(std_error_no_send)?; - Ok(()) +/// Manage the tarball of the database dump. +/// +/// Create the tarball, upload it to S3, and make sure it gets deleted. 
+struct DumpTarball { + tarball_path: PathBuf, +} + +impl DumpTarball { + fn create(export_dir: &Path) -> Result { + let tarball_path = export_dir.with_extension("tar.gz"); + let tarfile = File::create(&tarball_path)?; + let result = Self { tarball_path }; + let encoder = flate2::write::GzEncoder::new(tarfile, flate2::Compression::default()); + let mut archive = tar::Builder::new(encoder); + archive.append_dir_all(export_dir.file_name().unwrap(), &export_dir)?; + Ok(result) + } + + fn upload(&self, target_name: &str, uploader: &Uploader) -> Result<(), PerformError> { + let client = reqwest::Client::new(); + let tarfile = File::open(&self.tarball_path)?; + let content_length = tarfile.metadata()?.len(); + // TODO Figure out the correct content type. + uploader + .upload( + &client, + target_name, + tarfile, + content_length, + "application/gzip", + ) + .map_err(std_error_no_send)?; + Ok(()) + } +} + +impl Drop for DumpTarball { + fn drop(&mut self) { + std::fs::remove_file(&self.tarball_path).unwrap(); + } } mod gen_scripts; From 4448ca8b58228c115b89bb571f55a756023aedf7 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 3 Sep 2019 21:24:32 +0200 Subject: [PATCH 17/32] Refactor schema check unit test. --- src/tasks/dump_db/gen_scripts.rs | 40 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs index aa2267b2c4c..597d36f1f30 100644 --- a/src/tasks/dump_db/gen_scripts.rs +++ b/src/tasks/dump_db/gen_scripts.rs @@ -151,38 +151,44 @@ mod tests { use crate::test_util::pg_connection; use diesel::prelude::*; use std::collections::HashSet; + use std::iter::FromIterator; - /// Test whether the schema in the visibility configuration matches the test database. + /// Test whether the visibility configuration matches the schema of the + /// test database. 
#[test] fn check_visibility_config() { let conn = pg_connection(); - let db_columns: HashSet<_> = get_db_columns(&conn) - .into_iter() - .map(|c| (c.table_name, c.column_name)) - .collect(); - let visibility_config: VisibilityConfig = - toml::from_str(include_str!("dump-db.toml")).unwrap(); - let vis_columns: HashSet<_> = visibility_config + let db_columns = HashSet::::from_iter(get_db_columns(&conn)); + let vis_columns = toml::from_str::(include_str!("dump-db.toml")) + .unwrap() .0 .iter() .flat_map(|(table, config)| { - config - .columns - .iter() - .map(move |(column, _)| (table.clone(), column.clone())) + config.columns.iter().map(move |(column, _)| Column { + table_name: table.clone(), + column_name: column.clone(), + }) }) .collect(); let mut errors = vec![]; - for (table, col) in db_columns.difference(&vis_columns) { + for Column { + table_name, + column_name, + } in db_columns.difference(&vis_columns) + { errors.push(format!( "No visibility information for columns {}.{}.", - table, col + table_name, column_name )); } - for (table, col) in vis_columns.difference(&db_columns) { + for Column { + table_name, + column_name, + } in vis_columns.difference(&db_columns) + { errors.push(format!( "Column {}.{} does not exist in the database.", - table, col + table_name, column_name )); } assert!( @@ -203,7 +209,7 @@ mod tests { } } - #[derive(Debug, PartialEq, Queryable)] + #[derive(Debug, Eq, Hash, PartialEq, Queryable)] struct Column { table_name: String, column_name: String, From ede4fe05d02308cca7c919708de486a85cd68756 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 5 Sep 2019 20:36:10 +0200 Subject: [PATCH 18/32] Allow specifying column defaults in the DB dump config. --- src/tasks/dump_db/dump-db.toml | 7 +++++++ src/tasks/dump_db/dump-import.hbs | 21 ++++++++++++++++++--- src/tasks/dump_db/gen_scripts.rs | 12 +++++++++++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml index 00d34976042..83c305aa15a 100644 --- a/src/tasks/dump_db/dump-db.toml +++ b/src/tasks/dump_db/dump-db.toml @@ -12,6 +12,11 @@ # order of the tables in the generated import script. All tables referred # to by public columns in the current table should be listed, to make sure # they are imported before this table. +# +# .columns_defaults - a TOML dictionary mapping column names to a +# raw SQL expression that is used as the default value for the column on +# import. This is useful for private columns that are not nullable and do +# not have a default. [api_tokens.columns] id = "private" @@ -165,6 +170,8 @@ gh_login = "public" name = "public" gh_avatar = "public" gh_id = "public" +[users.column_defaults] +gh_access_token = "''" [version_authors] dependencies = ["users", "versions"] diff --git a/src/tasks/dump_db/dump-import.hbs b/src/tasks/dump_db/dump-import.hbs index 73708508309..fabbfeedf54 100644 --- a/src/tasks/dump_db/dump-import.hbs +++ b/src/tasks/dump_db/dump-import.hbs @@ -16,11 +16,26 @@ BEGIN; -- Set defaults for non-nullable columns not included in the dump. - ALTER TABLE users ALTER COLUMN gh_access_token SET DEFAULT ''; -{{#each tables}} +{{~#each tables as |table|}} +{{~#each column_defaults}} + ALTER TABLE "{{table.name}}" ALTER COLUMN "{{@key}}" SET DEFAULT {{this}}; +{{~/each}} +{{~/each}} + + -- Truncate all tables. +{{~#each tables}} TRUNCATE "{{this.name}}" RESTART IDENTITY CASCADE; {{~/each}} -{{#each tables}} + + -- Import the CSV data. 
+{{~#each tables}} \copy "{{this.name}}" ({{this.columns}}) FROM '{{this.name}}.csv' WITH CSV HEADER +{{~/each}} + + -- Drop the defaults again. +{{~#each tables as |table|}} +{{~#each column_defaults}} + ALTER TABLE "{{table.name}}" ALTER COLUMN "{{@key}}" DROP DEFAULT; +{{~/each}} {{~/each}} COMMIT; diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs index 597d36f1f30..ec3f89d153b 100644 --- a/src/tasks/dump_db/gen_scripts.rs +++ b/src/tasks/dump_db/gen_scripts.rs @@ -34,6 +34,8 @@ struct TableConfig { dependencies: Vec, filter: Option, columns: BTreeMap, + #[serde(default)] + column_defaults: BTreeMap, } /// Subset of the configuration data to be passed on to the Handlbars template. @@ -42,6 +44,7 @@ struct HandlebarsTableContext<'a> { name: &'a str, filter: Option<&'a str>, columns: String, + column_defaults: BTreeMap<&'a str, &'a str>, } impl TableConfig { @@ -56,10 +59,17 @@ impl TableConfig { if columns.is_empty() { None } else { + let filter = self.filter.as_ref().map(String::as_str); + let column_defaults = self + .column_defaults + .iter() + .map(|(k, v)| (k.as_str(), v.as_str())) + .collect(); Some(HandlebarsTableContext { name, - filter: self.filter.as_ref().map(String::as_str), + filter, columns, + column_defaults, }) } } From 1c4737bcacf8eb1b5eec7659aa2254b9edf47854 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 5 Sep 2019 20:48:48 +0200 Subject: [PATCH 19/32] Add basic unit tests for topological sorting. --- src/tasks/dump_db/gen_scripts.rs | 34 +++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs index ec3f89d153b..ec50dacd8d9 100644 --- a/src/tasks/dump_db/gen_scripts.rs +++ b/src/tasks/dump_db/gen_scripts.rs @@ -28,7 +28,7 @@ enum ColumnVisibility { /// constraints on public columns. The `filter` field is a valid SQL expression /// used in a `WHERE` clause to filter the rows of the table. The `columns` /// field maps column names to their respective visibilities. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize)] struct TableConfig { #[serde(default)] dependencies: Vec, @@ -76,7 +76,7 @@ impl TableConfig { } /// Maps table names to the respective configurations. Used to load `dump_db.toml`. 
-#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize)] #[serde(transparent)] struct VisibilityConfig(BTreeMap); @@ -204,7 +204,7 @@ mod tests { assert!( errors.is_empty(), "The visibility configuration does not match the database schema:\n{}", - errors.join("\n - "), + errors.join("\n"), ); } @@ -234,4 +234,32 @@ mod tests { .load(conn) .unwrap() } + + fn table_config_with_deps(deps: &[&str]) -> TableConfig { + TableConfig { + dependencies: deps.iter().cloned().map(ToOwned::to_owned).collect(), + ..Default::default() + } + } + + #[test] + fn test_topological_sort() { + let mut config = VisibilityConfig::default(); + let tables = &mut config.0; + tables.insert("a".to_owned(), table_config_with_deps(&["b", "c"])); + tables.insert("b".to_owned(), table_config_with_deps(&["c", "d"])); + tables.insert("c".to_owned(), table_config_with_deps(&["d"])); + config.0.insert("d".to_owned(), table_config_with_deps(&[])); + assert_eq!(config.topological_sort(), ["d", "c", "b", "a"]); + } + + #[test] + #[should_panic] + fn topological_sort_panics_for_cyclic_dependency() { + let mut config = VisibilityConfig::default(); + let tables = &mut config.0; + tables.insert("a".to_owned(), table_config_with_deps(&["b"])); + tables.insert("b".to_owned(), table_config_with_deps(&["a"])); + config.topological_sort(); + } } From c373f4b5f9d34b92d4c66b9f8310d4c9b538cffd Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sun, 8 Sep 2019 15:38:15 +0200 Subject: [PATCH 20/32] Add prototype for dump_db integration test. --- Cargo.toml | 1 + .../2017-10-08-193512_category_trees/up.sql | 3 +- .../up.sql | 2 +- src/tasks.rs | 2 +- src/tasks/dump_db.rs | 49 ++++++++------- src/tests/all.rs | 1 + src/tests/dump_db.rs | 62 +++++++++++++++++++ src/tests/load_foreign_key_constraints.sql | 2 + 8 files changed, 95 insertions(+), 27 deletions(-) create mode 100644 src/tests/dump_db.rs diff --git a/Cargo.toml b/Cargo.toml index 8c9eaebb343..610276bde3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,6 +90,7 @@ conduit-test = "0.8" hyper-tls = "0.3" lazy_static = "1.0" tokio-core = "0.1" +diesel_migrations = { version = "1.3.0", features = ["postgres"] } [build-dependencies] dotenv = "0.11" diff --git a/migrations/2017-10-08-193512_category_trees/up.sql b/migrations/2017-10-08-193512_category_trees/up.sql index 579160446b4..0fe64abba14 100644 --- a/migrations/2017-10-08-193512_category_trees/up.sql +++ b/migrations/2017-10-08-193512_category_trees/up.sql @@ -1,5 +1,4 @@ --- Your SQL goes here -CREATE EXTENSION ltree; +CREATE EXTENSION IF NOT EXISTS ltree; -- Create the new column which will represent our category tree. 
-- Fill it with values from `slug` column and then set to non-null diff --git a/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql b/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql index f188a9cd166..8b38c66cb4e 100644 --- a/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql +++ b/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql @@ -1,2 +1,2 @@ -CREATE EXTENSION pg_trgm; +CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE INDEX index_crates_name_tgrm ON crates USING gin (canon_crate_name(name) gin_trgm_ops); diff --git a/src/tasks.rs b/src/tasks.rs index d4f2f3e3203..ed9e0e91449 100644 --- a/src/tasks.rs +++ b/src/tasks.rs @@ -1,4 +1,4 @@ -mod dump_db; +pub mod dump_db; mod update_downloads; pub use dump_db::dump_db; diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 2b33ef3c155..d227b029b6d 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -27,41 +27,24 @@ pub fn dump_db( /// /// Create the directory, populate it with the psql scripts and CSV dumps, and /// make sure it gets deleted again even in the case of an error. -struct DumpDirectory { - export_dir: PathBuf, +#[derive(Debug)] +pub struct DumpDirectory { + pub export_dir: PathBuf, } impl DumpDirectory { - fn create() -> Result { + pub fn create() -> Result { let timestamp = chrono::Utc::now().format("%Y-%m-%d-%H%M%S").to_string(); let export_dir = std::env::temp_dir().join("dump-db").join(timestamp); std::fs::create_dir_all(&export_dir)?; Ok(Self { export_dir }) } - fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { + pub fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { let export_script = self.export_dir.join("export.sql"); let import_script = self.export_dir.join("import.sql"); gen_scripts::gen_scripts(&export_script, &import_script)?; - let psql_script = File::open(&export_script)?; - let psql = std::process::Command::new("psql") - .arg(database_url) - .current_dir(export_script.parent().unwrap()) - .stdin(psql_script) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::piped()) - .spawn()?; - let output = psql.wait_with_output()?; - if !output.stderr.is_empty() { - Err(format!( - "Error while executing psql: {}", - String::from_utf8_lossy(&output.stderr) - ))?; - } - if !output.status.success() { - Err("psql did not finish successfully.")?; - } - Ok(()) + run_psql(&export_script, database_url) } } @@ -71,6 +54,26 @@ impl Drop for DumpDirectory { } } +pub fn run_psql(script: &Path, database_url: &str) -> Result<(), PerformError> { + let psql_script = File::open(&script)?; + let psql = std::process::Command::new("psql") + .arg(database_url) + .current_dir(script.parent().unwrap()) + .stdin(psql_script) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + let output = psql.wait_with_output()?; + let stderr = String::from_utf8_lossy(&output.stderr); + if stderr.contains("ERROR") { + Err(format!("Error while executing psql: {}", stderr))?; + } + if !output.status.success() { + Err("psql did not finish successfully.")?; + } + Ok(()) +} + /// Manage the tarball of the database dump. /// /// Create the tarball, upload it to S3, and make sure it gets deleted. 
diff --git a/src/tests/all.rs b/src/tests/all.rs index 310cdf5db9c..af9108f48bd 100644 --- a/src/tests/all.rs +++ b/src/tests/all.rs @@ -52,6 +52,7 @@ mod badge; mod builders; mod categories; mod category; +mod dump_db; mod git; mod keyword; mod krate; diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs new file mode 100644 index 00000000000..7f9444062b7 --- /dev/null +++ b/src/tests/dump_db.rs @@ -0,0 +1,62 @@ +use cargo_registry::tasks::dump_db; +use diesel::{ + connection::{Connection, SimpleConnection}, + pg::PgConnection, +}; + +#[test] +fn dump_db_and_reimport_dump() { + let database_url = crate::env("TEST_DATABASE_URL"); + + // TODO prefill database with some data + + let directory = dump_db::DumpDirectory::create().unwrap(); + directory.dump_db(&database_url).unwrap(); + + let import_script = directory.export_dir.join("import.sql"); + let schema = TemporarySchema::create(database_url, "test_db_dump"); + diesel_migrations::run_pending_migrations(&schema.connection).unwrap(); + dump_db::run_psql(&import_script, &schema.database_url).unwrap(); +} + +struct TemporarySchema { + pub database_url: String, + pub schema_name: String, + pub connection: PgConnection, +} + +impl TemporarySchema { + fn create(database_url: String, schema_name: &str) -> Self { + let params = &[("options", format!("--search_path={}", schema_name))]; + let database_url = url::Url::parse_with_params(&database_url, params) + .unwrap() + .into_string(); + let schema_name = schema_name.to_owned(); + let connection = PgConnection::establish(&database_url).unwrap(); + connection + .batch_execute(&format!( + r#"DROP SCHEMA IF EXISTS "{schema_name}" CASCADE; + CREATE SCHEMA "{schema_name}"; + SET SESSION search_path TO "{schema_name}",public;"#, + schema_name = schema_name, + )) + .unwrap(); + Self { + database_url, + schema_name, + connection, + } + } +} + +impl Drop for TemporarySchema { + fn drop(&mut self) { + self.connection + .batch_execute(&format!( + r#"SET SESSION search_path TO DEFAULT; + DROP SCHEMA {schema_name} CASCADE;"#, + schema_name = self.schema_name, + )) + .unwrap(); + } +} diff --git a/src/tests/load_foreign_key_constraints.sql b/src/tests/load_foreign_key_constraints.sql index a7f320b0447..30a29abdb37 100644 --- a/src/tests/load_foreign_key_constraints.sql +++ b/src/tests/load_foreign_key_constraints.sql @@ -4,5 +4,7 @@ SELECT relname, conname, pg_get_constraintdef(pg_constraint.oid, true) AS defini FROM pg_attribute INNER JOIN pg_class ON pg_class.oid = attrelid LEFT JOIN pg_constraint ON pg_class.oid = conrelid AND ARRAY[attnum] = conkey + INNER JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace WHERE attname = $1 AND contype = 'f' + AND nspname = 'public'; From 44a995a0ac16323de17aee3ca794c6e0225946e0 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sun, 8 Sep 2019 20:42:16 +0200 Subject: [PATCH 21/32] Use cloned() instead of copied(). The latter is only available starting from Rust version 1.36, while Travis only seems to offer version 1.35. 
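The distinction only matters at the language-version level: Iterator::copied was stabilized in Rust 1.36, so the job falls back to cloned, which behaves identically here because the items being cloned are shared references, and references are Copy. A small illustration of the Option-to-iterator pattern the loop relies on, with hypothetical table names rather than the job's real data:

    use std::collections::BTreeMap;

    fn main() {
        let mut rev_deps: BTreeMap<&str, Vec<&str>> = BTreeMap::new();
        rev_deps.entry("crates").or_default().push("versions");

        // `get` returns Option<&Vec<&str>>. Iterating the Option and
        // flattening it yields the dependents, or nothing at all if the key
        // is absent. The first `cloned()` turns the `&&Vec<_>` items into
        // `&Vec<_>`; on Rust 1.36+ `copied()` does the same thing because
        // shared references implement `Copy`.
        let dependents: Vec<&str> = rev_deps
            .get("crates")
            .iter()
            .cloned()
            .flatten()
            .cloned()
            .collect();
        assert_eq!(dependents, ["versions"]);

        // A missing key simply produces an empty iterator instead of panicking.
        assert_eq!(rev_deps.get("badges").iter().cloned().flatten().count(), 0);
    }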
--- src/tasks/dump_db/gen_scripts.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs index ec50dacd8d9..4cc8050bf6e 100644 --- a/src/tasks/dump_db/gen_scripts.rs +++ b/src/tasks/dump_db/gen_scripts.rs @@ -110,7 +110,7 @@ impl VisibilityConfig { .collect(); while let Some(table) = ready.pop_front() { result.push(table); - for dep in rev_deps.get(table).iter().copied().flatten() { + for dep in rev_deps.get(table).iter().cloned().flatten() { *num_deps.get_mut(dep).unwrap() -= 1; if num_deps[dep] == 0 { ready.push_back(dep); From f5f536a5cec695409124a3f70e3af813670ef400 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sun, 8 Sep 2019 21:05:28 +0200 Subject: [PATCH 22/32] Suppress migrations output in dump_db test. --- src/tests/dump_db.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs index 7f9444062b7..fc4266efc9b 100644 --- a/src/tests/dump_db.rs +++ b/src/tests/dump_db.rs @@ -13,10 +13,13 @@ fn dump_db_and_reimport_dump() { let directory = dump_db::DumpDirectory::create().unwrap(); directory.dump_db(&database_url).unwrap(); - let import_script = directory.export_dir.join("import.sql"); let schema = TemporarySchema::create(database_url, "test_db_dump"); - diesel_migrations::run_pending_migrations(&schema.connection).unwrap(); + schema.run_migrations(); + + let import_script = directory.export_dir.join("import.sql"); dump_db::run_psql(&import_script, &schema.database_url).unwrap(); + + // TODO: Consistency checks on the re-imported data? } struct TemporarySchema { @@ -26,8 +29,8 @@ struct TemporarySchema { } impl TemporarySchema { - fn create(database_url: String, schema_name: &str) -> Self { - let params = &[("options", format!("--search_path={}", schema_name))]; + pub fn create(database_url: String, schema_name: &str) -> Self { + let params = &[("options", format!("--search_path={},public", schema_name))]; let database_url = url::Url::parse_with_params(&database_url, params) .unwrap() .into_string(); @@ -36,8 +39,7 @@ impl TemporarySchema { connection .batch_execute(&format!( r#"DROP SCHEMA IF EXISTS "{schema_name}" CASCADE; - CREATE SCHEMA "{schema_name}"; - SET SESSION search_path TO "{schema_name}",public;"#, + CREATE SCHEMA "{schema_name}";"#, schema_name = schema_name, )) .unwrap(); @@ -47,16 +49,23 @@ impl TemporarySchema { connection, } } + + pub fn run_migrations(&self) { + use diesel_migrations::{find_migrations_directory, run_pending_migrations_in_directory}; + let migrations_dir = find_migrations_directory().unwrap(); + run_pending_migrations_in_directory( + &self.connection, + &migrations_dir, + &mut std::io::sink(), + ) + .unwrap(); + } } impl Drop for TemporarySchema { fn drop(&mut self) { self.connection - .batch_execute(&format!( - r#"SET SESSION search_path TO DEFAULT; - DROP SCHEMA {schema_name} CASCADE;"#, - schema_name = self.schema_name, - )) + .batch_execute(&format!(r#"DROP SCHEMA "{}" CASCADE;"#, self.schema_name)) .unwrap(); } } From 200d99f521cf56d6fcc8b62470ac9e521df82fc0 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sun, 8 Sep 2019 21:42:45 +0200 Subject: [PATCH 23/32] Remove crates.licence column from dump_db configuration. 
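Before this configuration tweak, a note on the integration test introduced above: it keeps the restored data out of the default schema by putting a scratch schema first on the connection's search path, passed through the `options` parameter of the database URL. A condensed sketch of that URL construction follows; the database name and schema are placeholders, and the `url` crate used here is already a dependency of the test.

    // Build a connection URL that puts a scratch schema first on the search
    // path, so everything the import script creates lands in that schema.
    fn schema_url(database_url: &str, schema: &str) -> String {
        let params = [("options", format!("--search_path={},public", schema))];
        url::Url::parse_with_params(database_url, &params)
            .expect("invalid database URL")
            .into_string()
    }

    fn main() {
        let url = schema_url("postgres://localhost/cargo_registry_test", "test_db_dump");
        // The value is percent-encoded into the query string; PostgreSQL
        // decodes it again when the connection is established.
        assert!(url.contains("options=--search_path%3Dtest_db_dump%2Cpublic"));
        println!("{}", url);
    }

Because the temporary schema shadows `public` on the search path, the migrations and the re-imported dump both target it, and dropping the schema afterwards removes everything the test created.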
--- src/tasks/dump_db/dump-db.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml index 83c305aa15a..f025caeaf44 100644 --- a/src/tasks/dump_db/dump-db.toml +++ b/src/tasks/dump_db/dump-db.toml @@ -78,7 +78,6 @@ homepage = "public" documentation = "public" readme = "public" textsearchable_index_col = "public" -license = "public" repository = "public" max_upload_size = "public" From e760a53a148ebb40e1c97aeecc33c3aae3a4e74c Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 12 Sep 2019 21:47:39 +0200 Subject: [PATCH 24/32] Move data in dump tarbal into subdirectory. --- src/tasks/dump_db.rs | 1 + src/tasks/dump_db/{dump-export.hbs => dump-export.sql.hbs} | 4 ++-- src/tasks/dump_db/{dump-import.hbs => dump-import.sql.hbs} | 2 +- src/tasks/dump_db/gen_scripts.rs | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) rename src/tasks/dump_db/{dump-export.hbs => dump-export.sql.hbs} (75%) rename src/tasks/dump_db/{dump-import.hbs => dump-import.sql.hbs} (91%) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index d227b029b6d..d9984fcf323 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -44,6 +44,7 @@ impl DumpDirectory { let export_script = self.export_dir.join("export.sql"); let import_script = self.export_dir.join("import.sql"); gen_scripts::gen_scripts(&export_script, &import_script)?; + std::fs::create_dir(self.export_dir.join("data"))?; run_psql(&export_script, database_url) } } diff --git a/src/tasks/dump_db/dump-export.hbs b/src/tasks/dump_db/dump-export.sql.hbs similarity index 75% rename from src/tasks/dump_db/dump-export.hbs rename to src/tasks/dump_db/dump-export.sql.hbs index fa8582ad825..67336c285ab 100644 --- a/src/tasks/dump_db/dump-export.hbs +++ b/src/tasks/dump_db/dump-export.sql.hbs @@ -16,9 +16,9 @@ COMMIT; BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; {{~#each tables}} {{~#if this.filter}} - \copy (SELECT * FROM "dump_db_{{this.name}}") TO '{{this.name}}.csv' WITH CSV HEADER + \copy (SELECT * FROM "dump_db_{{this.name}}") TO 'data/{{this.name}}.csv' WITH CSV HEADER {{~else}} - \copy "{{this.name}}" ({{this.columns}}) TO '{{this.name}}.csv' WITH CSV HEADER + \copy "{{this.name}}" ({{this.columns}}) TO 'data/{{this.name}}.csv' WITH CSV HEADER {{~/if}} {{~/each}} COMMIT; diff --git a/src/tasks/dump_db/dump-import.hbs b/src/tasks/dump_db/dump-import.sql.hbs similarity index 91% rename from src/tasks/dump_db/dump-import.hbs rename to src/tasks/dump_db/dump-import.sql.hbs index fabbfeedf54..32dce384d03 100644 --- a/src/tasks/dump_db/dump-import.hbs +++ b/src/tasks/dump_db/dump-import.sql.hbs @@ -29,7 +29,7 @@ BEGIN; -- Import the CSV data. {{~#each tables}} - \copy "{{this.name}}" ({{this.columns}}) FROM '{{this.name}}.csv' WITH CSV HEADER + \copy "{{this.name}}" ({{this.columns}}) FROM 'data/{{this.name}}.csv' WITH CSV HEADER {{~/each}} -- Drop the defaults again. 
diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs index 4cc8050bf6e..e128a0165cd 100644 --- a/src/tasks/dump_db/gen_scripts.rs +++ b/src/tasks/dump_db/gen_scripts.rs @@ -142,12 +142,12 @@ impl VisibilityConfig { let mut handlebars = handlebars::Handlebars::new(); handlebars.register_escape_fn(handlebars::no_escape); handlebars.render_template_to_write( - include_str!("dump-export.hbs"), + include_str!("dump-export.sql.hbs"), &context, export_sql, )?; handlebars.render_template_to_write( - include_str!("dump-import.hbs"), + include_str!("dump-import.sql.hbs"), &context, import_sql, )?; From 3beeb3807eba23c0c9f046b68f54acf6ade2a2ef Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Thu, 12 Sep 2019 23:09:10 +0200 Subject: [PATCH 25/32] Add README and metadata to database dump. --- src/tasks/dump_db.rs | 39 +++++++++++++++++++++++-- src/tasks/dump_db/dump-export.sql.hbs | 3 -- src/tasks/dump_db/dump-import.sql.hbs | 16 ---------- src/tasks/dump_db/readme_for_tarball.md | 30 +++++++++++++++++++ 4 files changed, 66 insertions(+), 22 deletions(-) create mode 100644 src/tasks/dump_db/readme_for_tarball.md diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index d9984fcf323..83d7b0254f7 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -29,24 +29,57 @@ pub fn dump_db( /// make sure it gets deleted again even in the case of an error. #[derive(Debug)] pub struct DumpDirectory { + pub timestamp: chrono::DateTime, pub export_dir: PathBuf, } impl DumpDirectory { pub fn create() -> Result { - let timestamp = chrono::Utc::now().format("%Y-%m-%d-%H%M%S").to_string(); - let export_dir = std::env::temp_dir().join("dump-db").join(timestamp); + let timestamp = chrono::Utc::now(); + let timestamp_str = timestamp.format("%Y-%m-%d-%H%M%S").to_string(); + let export_dir = std::env::temp_dir().join("dump-db").join(timestamp_str); std::fs::create_dir_all(&export_dir)?; - Ok(Self { export_dir }) + Ok(Self { + timestamp, + export_dir, + }) } pub fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { + self.add_readme()?; + self.add_metadata()?; let export_script = self.export_dir.join("export.sql"); let import_script = self.export_dir.join("import.sql"); gen_scripts::gen_scripts(&export_script, &import_script)?; std::fs::create_dir(self.export_dir.join("data"))?; run_psql(&export_script, database_url) } + + fn add_readme(&self) -> Result<(), PerformError> { + use std::io::Write; + + let mut readme = File::create(self.export_dir.join("README.md"))?; + readme.write_all(include_bytes!("dump_db/readme_for_tarball.md"))?; + Ok(()) + } + + fn add_metadata(&self) -> Result<(), PerformError> { + #[derive(Serialize)] + struct Metadata<'a> { + timestamp: &'a chrono::DateTime, + crates_io_commit: String, + format_version: &'static str, + } + let metadata = Metadata { + timestamp: &self.timestamp, + crates_io_commit: dotenv::var("HEROKU_SLUG_COMMIT") + .unwrap_or_else(|_| "unknown".to_owned()), + format_version: "0.1", + }; + let file = File::create(self.export_dir.join("metadata.json"))?; + serde_json::to_writer_pretty(file, &metadata)?; + Ok(()) + } } impl Drop for DumpDirectory { diff --git a/src/tasks/dump_db/dump-export.sql.hbs b/src/tasks/dump_db/dump-export.sql.hbs index 67336c285ab..0fcf38cba4d 100644 --- a/src/tasks/dump_db/dump-export.sql.hbs +++ b/src/tasks/dump_db/dump-export.sql.hbs @@ -1,6 +1,3 @@ --- This script was used to create this database dump, and is only included in --- the archive for reference. 
- BEGIN; {{~#each tables}} {{~#if this.filter}} diff --git a/src/tasks/dump_db/dump-import.sql.hbs b/src/tasks/dump_db/dump-import.sql.hbs index 32dce384d03..49508e7e35a 100644 --- a/src/tasks/dump_db/dump-import.sql.hbs +++ b/src/tasks/dump_db/dump-import.sql.hbs @@ -1,19 +1,3 @@ --- Script for psql to restore the dump into a local crates.io database. --- --- WARNING: This will destroy the current database contents. --- --- Instructions: --- --- 1. Create a new database and run the Diesel migrations. --- --- createdb DATABASE_NAME --- diesel migration run --database-url DATABASE_URL --- --- 2. Run this script. --- --- cd DUMP_DIRECTORY --- psql DATABASE_URL < import.sql - BEGIN; -- Set defaults for non-nullable columns not included in the dump. {{~#each tables as |table|}} diff --git a/src/tasks/dump_db/readme_for_tarball.md b/src/tasks/dump_db/readme_for_tarball.md new file mode 100644 index 00000000000..45444ac1131 --- /dev/null +++ b/src/tasks/dump_db/readme_for_tarball.md @@ -0,0 +1,30 @@ +# crates.io Database Dump + +This is a dump of the public information in the crates.io database. + +## Files + +* `data/` – the CSV files with the actual dump data. +* `export.sql` – the `psql` script that was used to create this database dump. It is only included in the archive for reference. +* `import.sql` – a `psql` script that can be used to restore the dump into a PostgreSQL database with the same schema as the `crates.io` database, destroying all current data. +* `metadata.json` – some metadata of this dump. + +## Metadata Fields + +* `timestamp` – the UTC time the dump was started. +* `crates_io_commit` – the git commit hash of the deployed version of crates.io that created this dump. +* `format_version` – the version of the layout and format of this dump. Roughly follows SemVer conventions. + +## Restoring to a Local crates.io Database + +WARNING: This will destroy the current database contents. + +1. Create a new database and run the Diesel migrations. + + createdb DATABASE_NAME + diesel migration run --database-url DATABASE_URL + +2. Run this script. + + cd DUMP_DIRECTORY + psql DATABASE_URL < import.sql From d8b45d9f91b7310fe244e902b226d34e87d3ce14 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sat, 14 Sep 2019 21:03:47 +0200 Subject: [PATCH 26/32] Remove redundant dependency in dump-db.toml. --- src/tasks/dump_db/dump-db.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml index f025caeaf44..4e1c0a39537 100644 --- a/src/tasks/dump_db/dump-db.toml +++ b/src/tasks/dump_db/dump-db.toml @@ -173,7 +173,7 @@ gh_id = "public" gh_access_token = "''" [version_authors] -dependencies = ["users", "versions"] +dependencies = ["versions"] [version_authors.columns] id = "public" version_id = "public" From cc8f22e79472b19aec652c3fd8b7e95d9c1899d7 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Sat, 14 Sep 2019 22:28:08 +0200 Subject: [PATCH 27/32] Inlcude schema dump in database dump. 
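Before the schema dump is added, a note on the metadata file introduced above: it is a small Serialize struct written out with serde_json::to_writer_pretty, and at this point it records the dump timestamp and the deployed commit. A stripped-down sketch, using plain strings instead of the chrono types, std::env::var instead of dotenv::var, and assuming serde's derive feature is enabled:

    use std::fs::File;

    use serde::Serialize;

    // Illustrative stand-in for the metadata written next to the CSV data.
    #[derive(Serialize)]
    struct Metadata {
        timestamp: String,
        crates_io_commit: String,
    }

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let metadata = Metadata {
            timestamp: "2019-09-12-230910".to_owned(),
            crates_io_commit: std::env::var("HEROKU_SLUG_COMMIT")
                .unwrap_or_else(|_| "unknown".to_owned()),
        };
        // Pretty-printed JSON keeps the file easy to eyeball in the tarball.
        serde_json::to_writer_pretty(File::create("metadata.json")?, &metadata)?;
        Ok(())
    }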
--- src/tasks/dump_db.rs | 35 ++++++++++++++++++++----- src/tasks/dump_db/readme_for_tarball.md | 12 ++++----- src/tests/dump_db.rs | 2 +- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 83d7b0254f7..5b85b6d197b 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -16,7 +16,7 @@ pub fn dump_db( target_name: String, ) -> Result<(), PerformError> { let directory = DumpDirectory::create()?; - directory.dump_db(&database_url)?; + directory.populate(&database_url)?; let tarball = DumpTarball::create(&directory.export_dir)?; tarball.upload(&target_name, &env.uploader)?; println!("Database dump uploaded to {}.", &target_name); @@ -45,14 +45,11 @@ impl DumpDirectory { }) } - pub fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { + pub fn populate(&self, database_url: &str) -> Result<(), PerformError> { self.add_readme()?; self.add_metadata()?; - let export_script = self.export_dir.join("export.sql"); - let import_script = self.export_dir.join("import.sql"); - gen_scripts::gen_scripts(&export_script, &import_script)?; - std::fs::create_dir(self.export_dir.join("data"))?; - run_psql(&export_script, database_url) + self.dump_schema(database_url)?; + self.dump_db(database_url) } fn add_readme(&self) -> Result<(), PerformError> { @@ -80,6 +77,30 @@ impl DumpDirectory { serde_json::to_writer_pretty(file, &metadata)?; Ok(()) } + + pub fn dump_schema(&self, database_url: &str) -> Result<(), PerformError> { + let schema_sql = File::create(self.export_dir.join("schema.sql"))?; + let status = std::process::Command::new("pg_dump") + .arg("--schema-only") + .arg("--no-owner") + .arg("--no-acl") + .arg(database_url) + .stdout(schema_sql) + .spawn()? + .wait()?; + if !status.success() { + Err("pg_dump did not finish successfully.")?; + } + Ok(()) + } + + pub fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { + let export_script = self.export_dir.join("export.sql"); + let import_script = self.export_dir.join("import.sql"); + gen_scripts::gen_scripts(&export_script, &import_script)?; + std::fs::create_dir(self.export_dir.join("data"))?; + run_psql(&export_script, database_url) + } } impl Drop for DumpDirectory { diff --git a/src/tasks/dump_db/readme_for_tarball.md b/src/tasks/dump_db/readme_for_tarball.md index 45444ac1131..7989ae85d7f 100644 --- a/src/tasks/dump_db/readme_for_tarball.md +++ b/src/tasks/dump_db/readme_for_tarball.md @@ -17,14 +17,14 @@ This is a dump of the public information in the crates.io database. ## Restoring to a Local crates.io Database -WARNING: This will destroy the current database contents. - -1. Create a new database and run the Diesel migrations. +1. Create a new database. createdb DATABASE_NAME - diesel migration run --database-url DATABASE_URL -2. Run this script. +2. Restore the database schema. + + psql DATABASE_NAME < schema.sql + +3. Run the import script. 
- cd DUMP_DIRECTORY psql DATABASE_URL < import.sql diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs index fc4266efc9b..11187d6b16f 100644 --- a/src/tests/dump_db.rs +++ b/src/tests/dump_db.rs @@ -11,7 +11,7 @@ fn dump_db_and_reimport_dump() { // TODO prefill database with some data let directory = dump_db::DumpDirectory::create().unwrap(); - directory.dump_db(&database_url).unwrap(); + directory.populate(&database_url).unwrap(); let schema = TemporarySchema::create(database_url, "test_db_dump"); schema.run_migrations(); From 620bb51601500672f1f7ccac8ebfa719df9c62c8 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Mon, 16 Sep 2019 17:15:47 +0200 Subject: [PATCH 28/32] Remove format version from dump metadata. --- src/tasks/dump_db.rs | 2 -- src/tasks/dump_db/readme_for_tarball.md | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index 5b85b6d197b..a7997c59f93 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -65,13 +65,11 @@ impl DumpDirectory { struct Metadata<'a> { timestamp: &'a chrono::DateTime, crates_io_commit: String, - format_version: &'static str, } let metadata = Metadata { timestamp: &self.timestamp, crates_io_commit: dotenv::var("HEROKU_SLUG_COMMIT") .unwrap_or_else(|_| "unknown".to_owned()), - format_version: "0.1", }; let file = File::create(self.export_dir.join("metadata.json"))?; serde_json::to_writer_pretty(file, &metadata)?; diff --git a/src/tasks/dump_db/readme_for_tarball.md b/src/tasks/dump_db/readme_for_tarball.md index 7989ae85d7f..3d9f431d311 100644 --- a/src/tasks/dump_db/readme_for_tarball.md +++ b/src/tasks/dump_db/readme_for_tarball.md @@ -4,16 +4,16 @@ This is a dump of the public information in the crates.io database. ## Files -* `data/` – the CSV files with the actual dump data. +* `data/` – the CSV files with the actual data. * `export.sql` – the `psql` script that was used to create this database dump. It is only included in the archive for reference. * `import.sql` – a `psql` script that can be used to restore the dump into a PostgreSQL database with the same schema as the `crates.io` database, destroying all current data. * `metadata.json` – some metadata of this dump. +* `schema.sql` – a dump of the database schema to facilitate generating a new database from the data. ## Metadata Fields * `timestamp` – the UTC time the dump was started. * `crates_io_commit` – the git commit hash of the deployed version of crates.io that created this dump. -* `format_version` – the version of the layout and format of this dump. Roughly follows SemVer conventions. ## Restoring to a Local crates.io Database From ba3709bf96b0441b80c4d2c7c8324478e37b7f86 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Tue, 17 Sep 2019 13:32:13 +0200 Subject: [PATCH 29/32] Document the database dumps in the frontend. 
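The schema dump introduced a couple of patches above shells out to pg_dump; stripped of the job plumbing, the call looks roughly like the sketch below. It assumes pg_dump is on the PATH and uses placeholder names for the database URL and the output file.

    use std::fs::File;
    use std::process::Command;

    // Write `pg_dump --schema-only` output to a file and fail on a non-zero
    // exit status. The flags mirror the ones used by the background job.
    fn dump_schema(database_url: &str, target: &str) -> Result<(), Box<dyn std::error::Error>> {
        let schema_sql = File::create(target)?;
        let status = Command::new("pg_dump")
            .arg("--schema-only")
            .arg("--no-owner")
            .arg("--no-acl")
            .arg(database_url)
            .stdout(schema_sql)
            .status()?;
        if !status.success() {
            return Err(format!("pg_dump exited with {}", status).into());
        }
        Ok(())
    }

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        dump_schema("postgres://localhost/cargo_registry", "schema.sql")
    }

Skipping ownership and ACL statements keeps the resulting schema.sql loadable into a freshly created local database, which is exactly how the tarball's README tells consumers to use it.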
--- app/router.js | 1 + app/templates/data-access.hbs | 34 ++++++++++++++++++++++++++++++++++ app/templates/policies.hbs | 11 ++--------- 3 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 app/templates/data-access.hbs diff --git a/app/router.js b/app/router.js index 969fbfc156b..5a5f85cea47 100644 --- a/app/router.js +++ b/app/router.js @@ -46,6 +46,7 @@ Router.map(function() { this.route('category-slugs', { path: 'category_slugs' }); this.route('team', { path: '/teams/:team_id' }); this.route('policies'); + this.route('data-access'); this.route('confirm', { path: '/confirm/:email_token' }); this.route('catch-all', { path: '*path' }); diff --git a/app/templates/data-access.hbs b/app/templates/data-access.hbs new file mode 100644 index 00000000000..180d81ea8ab --- /dev/null +++ b/app/templates/data-access.hbs @@ -0,0 +1,34 @@ +
+ {{svg-jar 'circle-with-i'}} +

Accessing the Crates.io Data

+
+ +

+ There are several ways of accessing the Crates.io data. You should try the + options in the order listed. +

+ +
    +
  1. + + The crates.ioindex. + + This git repository is always kept up to date by crates.io, and it is used + by Cargo to speed up local dependency resolution. It contains the majority + of the data exposed by crates.io and is cheap to clone and to update. +
  2. +
  3. + The database dumps (experimental). The dumps contain all information + exposed by the API in a single download. They are updated every six hours. + The latest dump is available at the address + https://static.crates.io/db-dump.tar.gz. + Information on using the dump is contained in the tarball. +
  4. +
  5. + Crawl the crates.io API. This should be used as a last resort, and + doing so is subject to our {{#link-to 'policies'}}crawling policy{{/link-to}}. + If the index and the database dumps do not satisfy your needs, we're happy to + discuss solutions to your needs that don't require you to crawl the registry. + You can email us at help@crates.io. +
  6. +
diff --git a/app/templates/policies.hbs b/app/templates/policies.hbs index 3e99e6d441a..3f24f3cc560 100644 --- a/app/templates/policies.hbs +++ b/app/templates/policies.hbs @@ -112,15 +112,8 @@

Crawlers

- Before resorting to crawling crates.io, you should first see if you are able to - gather the information you need from the - crates.io index, - which is a public git repository containing the majority - of the information availble through our API. - - If the index does not have the information you need, we're also happy to - discuss solutions to your needs that don't require you to crawl the registry. - You can email us at help@crates.io. + Before resorting to crawling crates.io, please read + {{#link-to 'data-access'}}Accessing the Crates.io Data{{/link-to}}.

From c1428809f11ca094cce20524c242e1f873c34b38 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Wed, 18 Sep 2019 20:37:05 +0200 Subject: [PATCH 30/32] Add dependency for badges on crates to db dump configuration. --- src/tasks/dump_db/dump-db.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml index 4e1c0a39537..38318037c95 100644 --- a/src/tasks/dump_db/dump-db.toml +++ b/src/tasks/dump_db/dump-db.toml @@ -35,6 +35,8 @@ retries = "private" last_retry = "private" created_at = "private" +[badges] +dependencies = ["crates"] [badges.columns] crate_id = "public" badge_type = "public" From a57fed2d185acffde2d573ce1193c5f392731859 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Sat, 28 Sep 2019 14:47:52 -0400 Subject: [PATCH 31/32] Small edits to the data access documentation --- app/templates/data-access.hbs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/app/templates/data-access.hbs b/app/templates/data-access.hbs index 180d81ea8ab..eb5d5ee0892 100644 --- a/app/templates/data-access.hbs +++ b/app/templates/data-access.hbs @@ -11,15 +11,15 @@

  1. - The crates.ioindex. + The crates.io index. - This git repository is always kept up to date by crates.io, and it is used + This git repository is updated by crates.io, and it is used by Cargo to speed up local dependency resolution. It contains the majority - of the data exposed by crates.io and is cheap to clone and to update. + of the data exposed by crates.io and is cheap to clone and get updates.
  2. - The database dumps (experimental). The dumps contain all information - exposed by the API in a single download. They are updated every six hours. + The database dumps (experimental). The dump contains all information + exposed by the API in a single download. It is updated every 24 hours. The latest dump is available at the address https://static.crates.io/db-dump.tar.gz. Information on using the dump is contained in the tarball. @@ -28,7 +28,7 @@ Crawl the crates.io API. This should be used as a last resort, and doing so is subject to our {{#link-to 'policies'}}crawling policy{{/link-to}}. If the index and the database dumps do not satisfy your needs, we're happy to - discuss solutions to your needs that don't require you to crawl the registry. + discuss solutions that don't require you to crawl the registry. You can email us at help@crates.io.
From 01a4e98d23db4222e5f3ac7f47e74a9387cd0742 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Sat, 28 Sep 2019 19:32:46 -0400 Subject: [PATCH 32/32] Take new clippy suggestions that came with 1.38 --- src/tasks/dump_db.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs index a7997c59f93..b6c36a3a966 100644 --- a/src/tasks/dump_db.rs +++ b/src/tasks/dump_db.rs @@ -87,7 +87,7 @@ impl DumpDirectory { .spawn()? .wait()?; if !status.success() { - Err("pg_dump did not finish successfully.")?; + return Err("pg_dump did not finish successfully.".into()); } Ok(()) } @@ -119,10 +119,10 @@ pub fn run_psql(script: &Path, database_url: &str) -> Result<(), PerformError> { let output = psql.wait_with_output()?; let stderr = String::from_utf8_lossy(&output.stderr); if stderr.contains("ERROR") { - Err(format!("Error while executing psql: {}", stderr))?; + return Err(format!("Error while executing psql: {}", stderr).into()); } if !output.status.success() { - Err("psql did not finish successfully.")?; + return Err("psql did not finish successfully.".into()); } Ok(()) }