Skip to content

Commit

Permalink
backfill: basic functionality and tests
Browse files Browse the repository at this point in the history
The default behavior of 'git backfill' is to fetch all missing blobs that
are reachable from HEAD. Document and test this behavior.

The implementation is a very simple use of the path-walk API, initializing
the revision walk at HEAD to start the path-walk from all commits reachable
from HEAD. Ignore the object arrays that correspond to tree entries,
assuming that they are all present already.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
  • Loading branch information
derrickstolee authored and dscho committed Jan 7, 2025
1 parent 5de5395 commit ec146ba
Show file tree
Hide file tree
Showing 5 changed files with 222 additions and 3 deletions.
24 changes: 24 additions & 0 deletions Documentation/git-backfill.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,30 @@ SYNOPSIS
DESCRIPTION
-----------

Blobless partial clones are created using `git clone --filter=blob:none`,
which configures the local repository such that the Git client avoids
downloading blob objects unless they are required for a local operation.
This initially means that the clone and later fetches download reachable
commits and trees but no blobs. Later operations that change the `HEAD`
pointer, such as `git checkout` or `git merge`, may need to download
missing blobs in order to complete their operation.

In the worst cases, commands that compute blob diffs, such as `git blame`,
become very slow as they download each missing blob in its own single-blob
request at the moment the Git command needs it. This leads to many
download requests and gives the Git server no opportunity to provide
delta compression across those objects.

The `git backfill` command provides a way for the user to request that
Git downloads the missing blobs (with optional filters) such that the
missing blobs representing historical versions of files can be downloaded
in batches. The `backfill` command attempts to optimize the request by
grouping blobs that appear at the same path, hopefully leading to good
delta compression in the packfile sent by the server.

By default, `git backfill` downloads all blobs reachable from the `HEAD`
commit. This set can be restricted or expanded using various options.

SEE ALSO
--------
linkgit:git-clone[1].
Expand Down
1 change: 1 addition & 0 deletions Documentation/technical/api-path-walk.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,5 @@ Examples

See example usages in:
`t/helper/test-path-walk.c`,
`builtin/backfill.c`,
`builtin/pack-objects.c`
105 changes: 102 additions & 3 deletions builtin/backfill.c
Original file line number Diff line number Diff line change
@@ -1,16 +1,117 @@
#include "builtin.h"
#include "git-compat-util.h"
#include "config.h"
#include "parse-options.h"
#include "repository.h"
#include "commit.h"
#include "hex.h"
#include "tree.h"
#include "tree-walk.h"
#include "object.h"
#include "object-store-ll.h"
#include "oid-array.h"
#include "oidset.h"
#include "promisor-remote.h"
#include "strmap.h"
#include "string-list.h"
#include "revision.h"
#include "trace2.h"
#include "progress.h"
#include "packfile.h"
#include "path-walk.h"

/*
 * Usage strings for 'git backfill'. The array is NULL-terminated and the
 * single entry is marked for translation with N_().
 */
static const char * const builtin_backfill_usage[] = {
N_("git backfill [<options>]"),
NULL
};

/*
 * State for one backfill run. A pointer to this struct is handed to the
 * path-walk callback as its opaque data argument.
 */
struct backfill_context {
/* Repository in which objects are looked up and into which they are fetched. */
struct repository *repo;
/* Object IDs queued for the next download request. */
struct oid_array current_batch;
/*
 * Threshold for flushing: once current_batch holds at least this many
 * entries after a path has been processed, the batch is downloaded.
 */
size_t batch_size;
};

/*
 * Release the resources held by a backfill context. Currently this only
 * clears the pending batch of object IDs; ctx itself is not freed.
 */
static void clear_backfill_context(struct backfill_context *ctx)
{
oid_array_clear(&ctx->current_batch);
}

/*
 * Request every object ID queued in ctx->current_batch from the promisor
 * remote in one call, then empty the batch.
 *
 * Does nothing when the batch is empty: no fetch is attempted and the
 * packfile list is not rescanned. This lets callers flush unconditionally
 * (for example after the walk completes) without paying for a useless
 * rescan when there was nothing left to download.
 */
static void download_batch(struct backfill_context *ctx)
{
	if (!ctx->current_batch.nr)
		return;

	promisor_remote_get_direct(ctx->repo,
				   ctx->current_batch.oid,
				   ctx->current_batch.nr);
	oid_array_clear(&ctx->current_batch);

	/*
	 * We likely have a new packfile. Add it to the packed list to
	 * avoid possible duplicate downloads of the same objects.
	 */
	reprepare_packed_git(ctx->repo);
}

/*
 * Path-walk callback: queue the blobs in 'list' that are not available
 * locally and download the queue once it is large enough.
 *
 * Lists of any type other than OBJ_BLOB are ignored. A blob is queued
 * when its lookup fails or when the lookup reports an on-disk size of
 * zero. The batch-size check runs only after the whole list has been
 * examined, so all blobs handed over in one call end up in the same
 * download request.
 *
 * 'data' is the struct backfill_context for this run. Always returns 0.
 */
static int fill_missing_blobs(const char *path UNUSED,
			      struct oid_array *list,
			      enum object_type type,
			      void *data)
{
	struct backfill_context *ctx = data;
	size_t idx;

	if (type != OBJ_BLOB)
		return 0;

	for (idx = 0; idx < list->nr; idx++) {
		const struct object_id *oid = &list->oid[idx];
		struct object_info oi = OBJECT_INFO_INIT;
		off_t disk_size = 0;
		int present;

		oi.disk_sizep = &disk_size;

		/*
		 * NOTE(review): OBJECT_INFO_FOR_PREFETCH presumably keeps
		 * this lookup from fetching the object by itself -- confirm
		 * against the object-store documentation.
		 */
		present = !oid_object_info_extended(ctx->repo, oid, &oi,
						    OBJECT_INFO_FOR_PREFETCH) &&
			  disk_size;
		if (present)
			continue;

		oid_array_append(&ctx->current_batch, oid);
	}

	if (ctx->current_batch.nr >= ctx->batch_size)
		download_batch(ctx);

	return 0;
}

static int do_backfill(struct backfill_context *ctx)
{
struct rev_info revs;
struct path_walk_info info = PATH_WALK_INFO_INIT;
int ret;

repo_init_revisions(ctx->repo, &revs, "");
handle_revision_arg("HEAD", &revs, 0, 0);

info.blobs = 1;
info.tags = info.commits = info.trees = 0;

info.revs = &revs;
info.path_fn = fill_missing_blobs;
info.path_fn_data = ctx;

ret = walk_objects_by_path(&info);

/* Download the objects that did not fill a batch. */
if (!ret)
download_batch(ctx);

clear_backfill_context(ctx);
release_revisions(&revs);
return ret;
}

int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
{
struct backfill_context ctx = {
.repo = repo,
.current_batch = OID_ARRAY_INIT,
.batch_size = 50000,
};
struct option options[] = {
OPT_END(),
};
Expand All @@ -23,7 +124,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit

repo_config(repo, git_default_config, NULL);

die(_("not implemented"));

return 0;
return do_backfill(&ctx);
}
1 change: 1 addition & 0 deletions t/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,7 @@ integration_tests = [
't5617-clone-submodules-remote.sh',
't5618-alternate-refs.sh',
't5619-clone-local-ambiguous-transport.sh',
't5620-backfill.sh',
't5700-protocol-v1.sh',
't5701-git-serve.sh',
't5702-protocol-v2.sh',
Expand Down
94 changes: 94 additions & 0 deletions t/t5620-backfill.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/bin/sh

test_description='git backfill on partial clones'

GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME

. ./test-lib.sh

# We create objects in the 'src' repo.
#
# Six directories (the root, a, a/b, a/b/c, d and d/e) each receive files
# file.1.txt .. file.4.txt, and every file is written in two versions with
# distinct contents. That yields 2 * 4 * 6 = 48 distinct blobs, committed
# across eight commits (one per inner-loop iteration). The two print_*.awk
# helpers are written to the trash directory, outside of 'src'.
test_expect_success 'setup repo for object creation' '
echo "{print \$1}" >print_1.awk &&
echo "{print \$2}" >print_2.awk &&
git init src &&
mkdir -p src/a/b/c &&
mkdir -p src/d/e &&
for i in 1 2
do
for n in 1 2 3 4
do
echo "Version $i of file $n" > src/file.$n.txt &&
echo "Version $i of file a/$n" > src/a/file.$n.txt &&
echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt &&
echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt &&
echo "Version $i of file d/$n" > src/d/file.$n.txt &&
echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt &&
git -C src add . &&
git -C src commit -m "Iteration $n" || return 1
done
done
'

# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
# server for the partial clone. The server is configured to accept
# filtered requests (uploadpack.allowfilter) and requests for arbitrary
# object IDs (uploadpack.allowanysha1inwant).
test_expect_success 'setup bare clone for server' '
git clone --bare "file://$(pwd)/src" srv.bare &&
git -C srv.bare config --local uploadpack.allowfilter 1 &&
git -C srv.bare config --local uploadpack.allowanysha1inwant 1
'

# do basic partial clone from "srv.bare"
#
# The clone is blobless and has no checkout, so no blob is present before
# the backfill. The expected fetch_count of 48 equals the number of
# distinct blobs created by the setup test (2 versions * 4 files * 6
# directories).
test_expect_success 'do partial clone 1, backfill gets all objects' '
git clone --no-checkout --filter=blob:none \
--single-branch --branch=main \
"file://$(pwd)/srv.bare" backfill1 &&
# Backfill with no options gets everything reachable from HEAD.
GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \
-C backfill1 backfill &&
# We should have engaged the partial clone machinery
test_trace2_data promisor fetch_count 48 <backfill-file-trace &&
# No more missing objects!
git -C backfill1 rev-list --quiet --objects --missing=print HEAD >revs2 &&
test_line_count = 0 revs2
'

. "$TEST_DIRECTORY"/lib-httpd.sh
start_httpd

# Publish a bare clone of 'src' under the httpd document root and make a
# blobless, checkout-less partial clone of it through the smart HTTP
# endpoint. The result ('backfill-http') is used by the next test.
test_expect_success 'create a partial clone over HTTP' '
SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
rm -rf "$SERVER" repo &&
git clone --bare "file://$(pwd)/src" "$SERVER" &&
test_config -C "$SERVER" uploadpack.allowfilter 1 &&
test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 &&
git clone --no-checkout --filter=blob:none \
"$HTTPD_URL/smart/server" backfill-http
'

# Run the backfill in the HTTP partial clone. As in the file:// case, 48
# blobs are expected to be fetched. Afterwards every object listed by
# rev-list is fed to cat-file --batch-check, whose output must not contain
# the word "missing".
test_expect_success 'backfilling over HTTP succeeds' '
GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \
-C backfill-http backfill &&
# We should have engaged the partial clone machinery
test_trace2_data promisor fetch_count 48 <backfill-http-trace &&
# Confirm all objects are present, none missing.
git -C backfill-http rev-list --objects --all >rev-list-out &&
awk "{print \$1;}" <rev-list-out >oids &&
GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \
cat-file --batch-check <oids >batch-out &&
! grep missing batch-out
'

# DO NOT add non-httpd-specific tests here, because the last part of this
# test script is only executed when httpd is available and enabled.

test_done

0 comments on commit ec146ba

Please sign in to comment.