diff --git a/Cargo.toml b/Cargo.toml
index 5930c96168e..0b87904e40c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,12 +44,12 @@ max = ["max-control", "fast", "gitoxide-core-blocking-client", "http-client-curl
 max-pure = ["max-control", "gix-features/rustsha1", "gix-features/zlib-rust-backend", "http-client-reqwest", "gitoxide-core-blocking-client" ]
 
 ## Like `max`, but with more control for configuration. See the *Package Maintainers* headline for more information.
-max-control = ["fast-safe", "pretty-cli", "gitoxide-core-tools-query", "gitoxide-core-tools", "prodash-render-line", "prodash-render-tui", "prodash/render-line-autoconfigure", "gix/regex" ]
+max-control = ["fast-safe", "pretty-cli", "gitoxide-core-tools-query", "gitoxide-core-tools-corpus", "gitoxide-core-tools", "prodash-render-line", "prodash-render-tui", "prodash/render-line-autoconfigure", "gix/regex" ]
 
 ## All of the good stuff, with less fanciness for smaller binaries.
 ##
 ## As fast as possible, progress line rendering, all transports based on their most mature implementation (HTTP), all `ein` tools, CLI colors and local-time support, JSON output.
-lean = ["fast", "pretty-cli", "http-client-curl", "gitoxide-core-tools-query", "gitoxide-core-tools", "gitoxide-core-blocking-client", "prodash-render-line" ]
+lean = ["fast", "pretty-cli", "http-client-curl", "gitoxide-core-tools-query", "gitoxide-core-tools-corpus", "gitoxide-core-tools", "gitoxide-core-blocking-client", "prodash-render-line" ]
 
 ## The smallest possible build, best suitable for small single-core machines.
 ##
@@ -67,7 +67,7 @@ small = ["pretty-cli", "gix-features/rustsha1", "gix-features/zlib-rust-backend"
 ##
 ## Due to async client-networking not being implemented for most transports, this one supports only the 'git+tcp' and HTTP transport.
 ## It uses, however, a fully asynchronous networking implementation which can serve a real-world example on how to implement custom async transports.
-lean-async = ["fast", "pretty-cli", "gitoxide-core-tools", "gitoxide-core-tools-query", "gitoxide-core-async-client", "prodash-render-line"]
+lean-async = ["fast", "pretty-cli", "gitoxide-core-tools", "gitoxide-core-tools-query", "gitoxide-core-tools-corpus", "gitoxide-core-async-client", "prodash-render-line"]
 
 #! ### Package Maintainers
 #! `*-control` features leave it to you to configure C libraries, involving choices for `zlib`, ! hashing and transport implementation.
@@ -128,7 +128,10 @@ cache-efficiency-debug = ["gix-features/cache-efficiency-debug"]
 gitoxide-core-tools = ["gitoxide-core/organize", "gitoxide-core/estimate-hours"]
 
 ## A program to perform analytics on a `git` repository, using an auto-maintained sqlite database
-gitoxide-core-tools-query = ["gitoxide-core-tools", "gitoxide-core/query"]
+gitoxide-core-tools-query = ["gitoxide-core/query"]
+
+## A program to run algorithms on a corpus of repositories, recording each run for later comparison.
+gitoxide-core-tools-corpus = ["gitoxide-core/corpus"]
 
 #! ### Building Blocks for mutually exclusive networking
 #! Blocking and async features are mutually exclusive and cause a compile-time error. This also means that `cargo … --all-features` will fail.
diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml
index 3dc0dd5ed63..00d8ce20eb9 100644
--- a/gitoxide-core/Cargo.toml
+++ b/gitoxide-core/Cargo.toml
@@ -21,6 +21,8 @@ organize = ["dep:gix-url", "dep:jwalk"]
 estimate-hours = ["dep:itertools", "dep:fs-err", "dep:crossbeam-channel", "dep:smallvec"]
 ## Gather information about repositories and store it in a database for easy querying.
 query = ["dep:rusqlite"]
+## Run algorithms on a corpus of repositories and store their results for later comparison and intelligence gathering.
+corpus = ["dep:rusqlite"]
 
 #! ### Mutually Exclusive Networking
 #! If both are set, _blocking-client_ will take precedence, allowing `--all-features` to be used.
@@ -66,7 +68,7 @@ fs-err = { version = "2.6.0", optional = true }
 crossbeam-channel = { version = "0.5.6", optional = true }
 smallvec = { version = "1.10.0", optional = true }
 
-# for 'query'
+# for 'query' and 'corpus'
 rusqlite = { version = "0.29.0", optional = true, features = ["bundled"] }
 
 # for svg graph output
diff --git a/gitoxide-core/src/corpus/mod.rs b/gitoxide-core/src/corpus/mod.rs
new file mode 100644
index 00000000000..cdf53789e42
--- /dev/null
+++ b/gitoxide-core/src/corpus/mod.rs
@@ -0,0 +1,98 @@
+/// The state needed to operate on a corpus: a progress instance and the connection to the corpus database.
+pub struct Engine<P> {
+    progress: P,
+    con: rusqlite::Connection,
+}
+
+pub mod engine {
+    use crate::corpus::Engine;
+    use anyhow::Context;
+    use std::path::PathBuf;
+
+    impl<P> Engine<P>
+    where
+        P: gix::Progress,
+    {
+        /// Open the corpus DB or create it.
+        pub fn open_or_create(db: PathBuf, progress: P) -> anyhow::Result<Engine<P>> {
+            let con = crate::corpus::db::create(db).context("Could not open or create database")?;
+            Ok(Engine { progress, con })
+        }
+
+        /// Run on the existing set of repositories we have already seen or obtain them from `path` if there is none yet.
+        pub fn run(&self, _path: PathBuf) -> anyhow::Result<()> {
+            todo!()
+        }
+    }
+}
+
+pub mod db {
+    use anyhow::bail;
+    use rusqlite::{params, OptionalExtension};
+
+    /// A version to be incremented whenever the database layout is changed, to refresh it automatically.
+    const VERSION: usize = 1;
+
+    /// Open the database at `path` or create it, and make sure all tables exist.
+    ///
+    /// Fails if the database records a layout version other than `VERSION`, as migrations aren't implemented yet.
+    pub fn create(path: impl AsRef<std::path::Path>) -> anyhow::Result<rusqlite::Connection> {
+        let path = path.as_ref();
+        let con = rusqlite::Connection::open(path)?;
+        let meta_table = r#"
+        CREATE TABLE if not exists meta(
+            version int
+        )"#;
+        con.execute_batch(meta_table)?;
+        let version: Option<usize> = con.query_row("SELECT version FROM meta", [], |r| r.get(0)).optional()?;
+        match version {
+            None => {
+                con.execute("INSERT into meta(version) values(?)", params![VERSION])?;
+            }
+            Some(version) if version != VERSION => match con.close() {
+                Ok(()) => {
+                    bail!("Cannot handle database with version {version}, cannot yet migrate to {VERSION}");
+                }
+                Err((_, err)) => return Err(err.into()),
+            },
+            _ => {}
+        }
+        con.execute_batch(
+            r#"
+        CREATE TABLE if not exists runner(
+            hash blob(20) NOT NULL PRIMARY KEY
+        )
+        "#,
+        )?;
+        // Repositories are stored as paths which also have an id for referencing purposes
+        con.execute_batch(
+            r#"
+        CREATE TABLE if not exists repository(
+            file_id integer NOT NULL PRIMARY KEY,
+            file_path text UNIQUE
+        )
+        "#,
+        )?;
+        // Foreign keys must point at tables of this schema (`runner`, `repository`) or inserts fail once enforced.
+        con.execute_batch(
+            r#"
+        CREATE TABLE if not exists run(
+            hash blob(20),
+            file_id integer,
+            has_diff boolean NOT NULL,
+            lines_added integer NOT NULL,
+            lines_removed integer NOT NULL,
+            lines_before integer NOT NULL,
+            lines_after integer NOT NULL,
+            mode integer,
+            source_file_id integer,
+            FOREIGN KEY (hash) REFERENCES runner (hash),
+            FOREIGN KEY (file_id) REFERENCES repository (file_id),
+            PRIMARY KEY (hash, file_id)
+        )
+        "#,
+        )?;
+
+        Ok(con)
+    }
+}
diff --git a/gitoxide-core/src/lib.rs b/gitoxide-core/src/lib.rs
index ebba4f69b76..58e688ea983 100644
--- a/gitoxide-core/src/lib.rs
+++ b/gitoxide-core/src/lib.rs
@@ -64,6 +64,8 @@ impl FromStr for OutputFormat {
 }
 
 pub mod commitgraph;
+#[cfg(feature = "corpus")]
+pub mod corpus;
 pub mod net;
 
 #[cfg(feature = "estimate-hours")]
diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs
index 115253dd178..e329a6bbf05 100644
--- a/src/plumbing/main.rs
+++ b/src/plumbing/main.rs
@@ -13,6 +13,8 @@ use gitoxide_core as core;
 use gitoxide_core::pack::verify;
 use gix::bstr::io::BufReadExt;
 
+#[cfg(feature = "gitoxide-core-tools-corpus")]
+use crate::plumbing::options::corpus;
 use crate::{
     plumbing::{
         options::{
@@ -128,6 +130,20 @@ pub fn main() -> Result<()> {
     })?;
 
     match cmd {
+        #[cfg(feature = "gitoxide-core-tools-corpus")]
+        Subcommands::Corpus(corpus::Platform { db, path, cmd }) => prepare_and_run(
+            "corpus",
+            auto_verbose,
+            progress,
+            progress_keep_open,
+            None,
+            move |progress, _out, _err| {
+                let engine = core::corpus::Engine::open_or_create(db, progress)?;
+                match cmd {
+                    corpus::SubCommands::Run => engine.run(path),
+                }
+            },
+        ),
         Subcommands::CommitGraph(cmd) => match cmd {
             commitgraph::Subcommands::List { spec } => prepare_and_run(
                 "commitgraph-list",
diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs
index 4a9ce607ae2..a58492d8fe5 100644
--- a/src/plumbing/options/mod.rs
+++ b/src/plumbing/options/mod.rs
@@ -117,11 +117,40 @@ pub enum Subcommands {
     /// Show which git configuration values are used or planned.
     ConfigTree,
     Config(config::Platform),
+    #[cfg(feature = "gitoxide-core-tools-corpus")]
+    Corpus(corpus::Platform),
     /// Subcommands that need no git repository to run.
     #[clap(subcommand)]
     Free(free::Subcommands),
 }
 
+#[cfg(feature = "gitoxide-core-tools-corpus")]
+pub mod corpus {
+    use std::path::PathBuf;
+
+    #[derive(Debug, clap::Parser)]
+    #[command(
+        about = "run algorithms on a corpus of git repositories and store their results for later analysis",
+        version = clap::crate_version!(), // TODO: make this an actual version that is git describe, leverage `gix`
+    )]
+    pub struct Platform {
+        /// The path to the database to read and write depending on the sub-command.
+        #[arg(long, default_value = "corpus.db")]
+        pub db: PathBuf,
+        /// The path to the root of the corpus to search repositories in.
+        #[arg(long, short = 'p', default_value = ".")]
+        pub path: PathBuf,
+        #[clap(subcommand)]
+        pub cmd: SubCommands,
+    }
+
+    #[derive(Debug, clap::Subcommand)]
+    pub enum SubCommands {
+        /// Perform a corpus run on all registered repositories.
+        Run,
+    }
+}
+
 pub mod config {
     use gix::bstr::BString;
 