diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 473a16135abf86..8b4a22319b9d19 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -12,7 +12,8 @@ SYNOPSIS 'git pack-objects' [-q | --progress | --all-progress] [--all-progress-implied] [--no-reuse-delta] [--delta-base-offset] [--non-empty] [--local] [--incremental] [--window=] [--depth=] - [--revs [--unpacked | --all]] [--stdout | base-name] + [--revs [--unpacked | --all]] + [--stdout [--filter=] | base-name] [--shallow] [--keep-true-parents] < object-list @@ -236,6 +237,11 @@ So does `git bundle` (see linkgit:git-bundle[1]) when it creates a bundle. With this option, parents that are hidden by grafts are packed nevertheless. +--filter=:: + Requires `--stdout`. Omits certain objects (usually blobs) from + the resulting packfile. See linkgit:git-rev-list[1] for valid + `` forms. + SEE ALSO -------- linkgit:git-rev-list[1] diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt index ef22f1775b6348..6d2e60dab34fe3 100644 --- a/Documentation/git-rev-list.txt +++ b/Documentation/git-rev-list.txt @@ -47,7 +47,10 @@ SYNOPSIS [ --fixed-strings | -F ] [ --date=] [ [ --objects | --objects-edge | --objects-edge-aggressive ] - [ --unpacked ] ] + [ --unpacked ] + [ --filter= ] ] + [ --filter-print-missing ] + [ --filter-print-omitted ] [ --pretty | --header ] [ --bisect ] [ --bisect-vars ] diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt index 7d860bfca1442e..88f88788b0cd35 100644 --- a/Documentation/rev-list-options.txt +++ b/Documentation/rev-list-options.txt @@ -706,6 +706,36 @@ ifdef::git-rev-list[] --unpacked:: Only useful with `--objects`; print the object IDs that are not in packs. + +--filter=:: + Only useful with one of the `--objects*`; omits objects (usually + blobs) from the list of printed objects. The '' + may be one of the following: ++ +The form '--filter=blob:none' omits all blobs. ++ +The form '--filter=blob:limit=[kmg]' omits blobs larger than n bytes +or units. The value may be zero. Special files matching '.git*' are +alwayse included, regardless of size. ++ +The form '--filter=sparse:oid=' uses a sparse-checkout +specification contained in the object (or the object that the expression +evaluates to) to omit blobs not required by the corresponding sparse +checkout. ++ +The form '--filter=sparse:path=' similarly uses a sparse-checkout +specification contained in . + +--filter-print-missing:: + Prints a list of the missing objects for the requested traversal. + Object IDs are prefixed with a ``?'' character. The object type + is printed after the ID. This may be used with or without any of + the above filtering options. + +--filter-print-omitted:: + Only useful with one of the above `--filter*`; prints a list + of the omitted objects. Object IDs are prefixed with a ``~'' + character. endif::git-rev-list[] --no-walk[=(sorted|unsorted)]:: diff --git a/Documentation/technical/pack-protocol.txt b/Documentation/technical/pack-protocol.txt index ed1eae8b83a651..68e1400fe26e03 100644 --- a/Documentation/technical/pack-protocol.txt +++ b/Documentation/technical/pack-protocol.txt @@ -212,6 +212,7 @@ out of what the server said it could do with the first 'want' line. upload-request = want-list *shallow-line *1depth-request + [filter-request] flush-pkt want-list = first-want @@ -227,6 +228,8 @@ out of what the server said it could do with the first 'want' line. 
additional-want = PKT-LINE("want" SP obj-id) depth = 1*DIGIT + + filter-request = PKT-LINE("filter" SP "filter-spec") ---- Clients MUST send all the obj-ids it wants from the reference @@ -249,6 +252,12 @@ complete those commits. Commits whose parents are not received as a result are defined as shallow and marked as such in the server. This information is sent back to the client in the next step. +The client can optionally request that pack-objects omit various +objects from the packfile using one of several filtering techniques. +These are intended for use with partial clone/fetch operations. +The value of "filter-spec" is passed by upload-pack to pack-objects +using the `--filter=` parameter. + Once all the 'want's and 'shallow's (and optional 'deepen') are transferred, clients MUST send a flush-pkt, to tell the server side that it is done sending the list. diff --git a/Documentation/technical/protocol-capabilities.txt b/Documentation/technical/protocol-capabilities.txt index 26dcc6f502020d..2b94679fc9f420 100644 --- a/Documentation/technical/protocol-capabilities.txt +++ b/Documentation/technical/protocol-capabilities.txt @@ -309,3 +309,11 @@ to accept a signed push certificate, and asks the to be included in the push certificate. A send-pack client MUST NOT send a push-cert packet unless the receive-pack server advertises this capability. + +filter-objects +-------------- + +If the upload-pack server advertises the 'filter' capability, +fetch-pack may send a "filter " command to request +a partial clone or fetch where the server omits various objects +from the packfile. diff --git a/Documentation/technical/repository-version.txt b/Documentation/technical/repository-version.txt index 00ad37986efdce..9d488dbbcade4d 100644 --- a/Documentation/technical/repository-version.txt +++ b/Documentation/technical/repository-version.txt @@ -86,3 +86,25 @@ for testing format-1 compatibility. When the config key `extensions.preciousObjects` is set to `true`, objects in the repository MUST NOT be deleted (e.g., by `git-prune` or `git repack -d`). + +`partialcloneremote` +~~~~~~~~~~~~~~~~~~~~ + +When the config key `extensions.partialcloneremote` is set, it indicates +that the repo was created with a partial clone (or later performed +a partial fetch) and that the remote may have omitted sending +certain unwanted objects. Such a remote is called a "promisor remote" +and it promises that all such omitted objects can be fetched from it +in the future. + +The value of this key is the name of the promisor remote. + +`partialclonefilter` +~~~~~~~~~~~~~~~~~~~~ + +When the config key `extensions.partialclonefilter` is set, it gives +the initial filter expression used to create the partial clone. +This value becomed the default filter expression for subsequent +fetches (called "partial fetches") from the promisor remote. This +value may also be set by the first explicit partial fetch following a +normal clone. 
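The `filter-request` introduced above travels as an ordinary pkt-line, so no new framing is needed on the wire. The following standalone sketch (not code from this patch) shows roughly how a client could encode an upload-request that advertises the `filter` capability on its first want line and then sends `filter blob:none`; the object id, the rest of the capability list, and the filter spec are invented for the example.

```c
#include <stdio.h>
#include <string.h>

/*
 * Encode one pkt-line: a 4-hex-digit length (which counts the 4 header
 * bytes themselves) followed by the payload.  "0000" is the flush-pkt.
 */
static void write_pktline(FILE *out, const char *payload)
{
	fprintf(out, "%04zx%s", strlen(payload) + 4, payload);
}

int main(void)
{
	/* Hypothetical values, for illustration only. */
	const char *oid  = "3f786850e387550fdab836ed7e6dc881de23001b";
	const char *spec = "blob:none";
	char line[256];

	/* The first want line carries the capability list, including "filter". */
	snprintf(line, sizeof(line),
		 "want %s multi_ack_detailed side-band-64k thin-pack ofs-delta filter\n",
		 oid);
	write_pktline(stdout, line);

	/* filter-request = PKT-LINE("filter" SP filter-spec) */
	snprintf(line, sizeof(line), "filter %s\n", spec);
	write_pktline(stdout, line);

	fputs("0000", stdout);	/* flush-pkt ends the upload-request */
	return 0;
}
```

The real client builds the same bytes with `packet_buf_write()` in fetch-pack.c further down; the sketch only illustrates the length prefix and the trailing flush-pkt.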
diff --git a/Makefile b/Makefile index cd75985991f453..38632fb9c4b2e2 100644 --- a/Makefile +++ b/Makefile @@ -807,6 +807,11 @@ LIB_OBJS += levenshtein.o LIB_OBJS += line-log.o LIB_OBJS += line-range.o LIB_OBJS += list-objects.o +LIB_OBJS += list-objects-filter-blobs-limit.o +LIB_OBJS += list-objects-filter-blobs-none.o +LIB_OBJS += list-objects-filter-map.o +LIB_OBJS += list-objects-filter-options.o +LIB_OBJS += list-objects-filter-sparse.o LIB_OBJS += ll-merge.o LIB_OBJS += lockfile.o LIB_OBJS += log-tree.o @@ -836,6 +841,7 @@ LIB_OBJS += pack-write.o LIB_OBJS += pager.o LIB_OBJS += parse-options.o LIB_OBJS += parse-options-cb.o +LIB_OBJS += partial-clone-utils.o LIB_OBJS += patch-delta.o LIB_OBJS += patch-ids.o LIB_OBJS += path.o diff --git a/builtin/clone.c b/builtin/clone.c index dbddd98f80d666..5c392575444572 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -26,6 +26,7 @@ #include "run-command.h" #include "connected.h" #include "packfile.h" +#include "partial-clone-utils.h" /* * Overall FIXMEs: @@ -60,6 +61,7 @@ static struct string_list option_optional_reference = STRING_LIST_INIT_NODUP; static int option_dissociate; static int max_jobs = -1; static struct string_list option_recurse_submodules = STRING_LIST_INIT_NODUP; +static struct list_objects_filter_options filter_options; static int recurse_submodules_cb(const struct option *opt, const char *arg, int unset) @@ -135,6 +137,7 @@ static struct option builtin_clone_options[] = { TRANSPORT_FAMILY_IPV4), OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"), TRANSPORT_FAMILY_IPV6), + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_END() }; @@ -1073,6 +1076,8 @@ int cmd_clone(int argc, const char **argv, const char *prefix) warning(_("--shallow-since is ignored in local clones; use file:// instead.")); if (option_not.nr) warning(_("--shallow-exclude is ignored in local clones; use file:// instead.")); + if (filter_options.choice) + warning(_("Partial clone is ignored in local clones; use file:// instead.")); if (!access(mkpath("%s/shallow", path), F_OK)) { if (option_local > 0) warning(_("source repository is shallow, ignoring --local")); @@ -1104,7 +1109,11 @@ int cmd_clone(int argc, const char **argv, const char *prefix) transport_set_option(transport, TRANS_OPT_UPLOADPACK, option_upload_pack); - if (transport->smart_options && !deepen) + if (filter_options.choice) + transport_set_option(transport, TRANS_OPT_LIST_OBJECTS_FILTER, + filter_options.raw_value); + + if (transport->smart_options && !deepen && !filter_options.choice) transport->smart_options->check_self_contained_and_connected = 1; refs = transport_get_remote_refs(transport); @@ -1164,13 +1173,18 @@ int cmd_clone(int argc, const char **argv, const char *prefix) write_refspec_config(src_ref_prefix, our_head_points_at, remote_head_points_at, &branch_top); + if (filter_options.choice) + partial_clone_utils_register(&filter_options, "origin", + "clone"); + if (is_local) clone_local(path, git_dir); else if (refs && complete_refs_before_fetch) transport_fetch_refs(transport, mapped_refs); update_remote_refs(refs, mapped_refs, remote_head_points_at, - branch_top.buf, reflog_msg.buf, transport, !is_local); + branch_top.buf, reflog_msg.buf, transport, + !is_local && !filter_options.choice); update_head(our_head_points_at, remote_head, reflog_msg.buf); diff --git a/builtin/fetch-pack.c b/builtin/fetch-pack.c index 366b9d13f929b7..0d0d9611b125cb 100644 --- a/builtin/fetch-pack.c +++ b/builtin/fetch-pack.c @@ -143,6 +143,11 @@ int cmd_fetch_pack(int argc, const char 
**argv, const char *prefix) args.update_shallow = 1; continue; } + if (skip_prefix(arg, ("--" CL_ARG__FILTER "="), &arg)) { + parse_list_objects_filter(&args.filter_options, arg); + continue; + } + usage(fetch_pack_usage); } if (deepen_not.nr) diff --git a/builtin/fetch.c b/builtin/fetch.c index 225c734924f148..a30481184f9e8c 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -18,6 +18,7 @@ #include "argv-array.h" #include "utf8.h" #include "packfile.h" +#include "partial-clone-utils.h" static const char * const builtin_fetch_usage[] = { N_("git fetch [] [ [...]]"), @@ -55,6 +56,7 @@ static int recurse_submodules_default = RECURSE_SUBMODULES_ON_DEMAND; static int shown_url = 0; static int refmap_alloc, refmap_nr; static const char **refmap_array; +static struct list_objects_filter_options filter_options; static int git_fetch_config(const char *k, const char *v, void *cb) { @@ -160,6 +162,7 @@ static struct option builtin_fetch_options[] = { TRANSPORT_FAMILY_IPV4), OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"), TRANSPORT_FAMILY_IPV6), + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_END() }; @@ -754,6 +757,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, const char *filename = dry_run ? "/dev/null" : git_path_fetch_head(); int want_status; int summary_width = transport_summary_width(ref_map); + struct check_connected_options opt = CHECK_CONNECTED_INIT; fp = fopen(filename, "a"); if (!fp) @@ -765,7 +769,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, url = xstrdup("foreign"); rm = ref_map; - if (check_connected(iterate_ref_map, &rm, NULL)) { + if (check_connected(iterate_ref_map, &rm, &opt)) { rc = error(_("%s did not send all necessary objects\n"), url); goto abort; } @@ -1044,6 +1048,9 @@ static struct transport *prepare_transport(struct remote *remote, int deepen) set_option(transport, TRANS_OPT_DEEPEN_RELATIVE, "yes"); if (update_shallow) set_option(transport, TRANS_OPT_UPDATE_SHALLOW, "yes"); + if (filter_options.choice) + set_option(transport, TRANS_OPT_LIST_OBJECTS_FILTER, + filter_options.raw_value); return transport; } @@ -1242,6 +1249,20 @@ static int fetch_multiple(struct string_list *list) int i, result = 0; struct argv_array argv = ARGV_ARRAY_INIT; + if (filter_options.choice) { + /* + * We currently only support partial-fetches + * to the remote used for the partial-clone + * because we only support 1 promisor remote. + * + * Note that the loop below will spawn background + * fetches for each remote and one of them may + * INHERIT partial-fetch settings, so everything + * is consistent. + */ + die(_("partial-fetch is not supported on multiple remotes")); + } + if (!append && !dry_run) { int errcode = truncate_fetch_head(); if (errcode) @@ -1267,6 +1288,45 @@ static int fetch_multiple(struct string_list *list) return result; } +static inline void partial_fetch_one_setup(struct remote *remote) +{ + if (filter_options.choice) { + /* + * A partial-fetch was explicitly requested. + * + * If this is the first partial-* command on + * this repo, we must register the partial + * settings in the repository extension. + * + * If this follows a previous partial-* command + * we must ensure the args are consistent with + * the existing registration (because we don't + * currently support mixing-and-matching). 
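The fetch side has to decide between registering a brand-new filter (an explicit `--filter=` on the command line) and inheriting the one recorded by an earlier partial clone. A compressed standalone model of that decision follows; the helper name `choose_filter`, the hard-coded config values, and the remote names are invented, and the consistency check that die()s on a mismatched remote is left out.

```c
#include <stdio.h>
#include <string.h>

/* Stand-ins for the extension values that would be loaded from .git/config. */
static const char *registered_remote = "origin";
static const char *registered_filter = "blob:limit=1m";

/*
 * Return the filter spec to use for this fetch, or NULL for a full fetch.
 * cmdline_filter is the --filter= argument, if any was given.
 */
static const char *choose_filter(const char *remote, const char *cmdline_filter)
{
	if (cmdline_filter)			/* explicit --filter: (re)register it */
		return cmdline_filter;
	if (registered_remote && !strcmp(remote, registered_remote))
		return registered_filter;	/* inherit the clone-time filter */
	return NULL;				/* ordinary, unfiltered fetch */
}

int main(void)
{
	printf("origin:   %s\n", choose_filter("origin", NULL));
	printf("mirror:   %s\n", choose_filter("mirror", NULL) ? "filtered" : "full fetch");
	printf("explicit: %s\n", choose_filter("origin", "blob:none"));
	return 0;
}
```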
+ */ + partial_clone_utils_register(&filter_options, + remote->name, "fetch"); + return; + } + + if (is_partial_clone_registered() && + !strcmp(remote->name, repository_format_partial_clone_remote)) { + /* + * If a partial-* command has already been used on + * this repo and it was to this remote, we should + * inherit the filter settings used previously. + * That is, if clone omitted very large blobs, then + * fetch should too. + * + * Use the cached filter-spec and create the filter + * settings. + */ + parse_list_objects_filter( + &filter_options, + repository_format_partial_clone_filter); + } +} + + static int fetch_one(struct remote *remote, int argc, const char **argv) { static const char **refs = NULL; @@ -1278,6 +1338,9 @@ static int fetch_one(struct remote *remote, int argc, const char **argv) die(_("No remote repository specified. Please, specify either a URL or a\n" "remote name from which new revisions should be fetched.")); + partial_fetch_one_setup(remote); + + gtransport = prepare_transport(remote, 1); if (prune < 0) { @@ -1322,7 +1385,7 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) { int i; struct string_list list = STRING_LIST_INIT_DUP; - struct remote *remote; + struct remote *remote = NULL; int result = 0; struct argv_array argv_gc_auto = ARGV_ARRAY_INIT; @@ -1367,17 +1430,14 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) else if (argc > 1) die(_("fetch --all does not make sense with refspecs")); (void) for_each_remote(get_one_remote_for_fetch, &list); - result = fetch_multiple(&list); } else if (argc == 0) { /* No arguments -- use default remote */ remote = remote_get(NULL); - result = fetch_one(remote, argc, argv); } else if (multiple) { /* All arguments are assumed to be remotes or groups */ for (i = 0; i < argc; i++) if (!add_remote_or_group(argv[i], &list)) die(_("No such remote or remote group: %s"), argv[i]); - result = fetch_multiple(&list); } else { /* Single remote or group */ (void) add_remote_or_group(argv[0], &list); @@ -1385,14 +1445,19 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) /* More than one remote */ if (argc > 1) die(_("Fetching a group and specifying refspecs does not make sense")); - result = fetch_multiple(&list); } else { /* Zero or one remotes */ remote = remote_get(argv[0]); - result = fetch_one(remote, argc-1, argv+1); + argc--; + argv++; } } + if (remote) + result = fetch_one(remote, argc, argv); + else + result = fetch_multiple(&list); + if (!result && (recurse_submodules != RECURSE_SUBMODULES_OFF)) { struct argv_array options = ARGV_ARRAY_INIT; diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 8ec459f5225228..5930615eed30a7 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -13,6 +13,7 @@ #include "streaming.h" #include "thread-utils.h" #include "packfile.h" +#include "partial-clone-utils.h" static const char index_pack_usage[] = "git index-pack [-v] [-o ] [--keep | --keep=] [--verify] [--strict] ( | --stdin [--fix-thin] [])"; @@ -222,6 +223,17 @@ static unsigned check_object(struct object *obj) if (!(obj->flags & FLAG_CHECKED)) { unsigned long size; int type = sha1_object_info(obj->oid.hash, &size); + + if (type <= 0 && is_partial_clone_registered()) { + /* + * Relax consistency checks to not complain about + * missing objects (because of earlier partial + * clone or fetch). 
+ */ + obj->flags |= FLAG_CHECKED; + return 0; + } + if (type <= 0) die(_("did not receive expected object %s"), oid_to_hex(&obj->oid)); diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 6e77dfd44439f4..a25185063ebb2a 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -79,6 +79,8 @@ static unsigned long cache_max_small_delta_size = 1000; static unsigned long window_memory_limit = 0; +static struct list_objects_filter_options filter_options; + /* * stats */ @@ -2816,7 +2818,12 @@ static void get_object_list(int ac, const char **av) if (prepare_revision_walk(&revs)) die("revision walk setup failed"); mark_edges_uninteresting(&revs, show_edge); - traverse_commit_list(&revs, show_commit, show_object, NULL); + if (filter_options.choice) + traverse_commit_list_filtered(&filter_options, &revs, + show_commit, show_object, + NULL, NULL); + else + traverse_commit_list(&revs, show_commit, show_object, NULL); if (unpack_unreachable_expiration) { revs.ignore_missing_links = 1; @@ -2952,6 +2959,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("use a bitmap index if available to speed up counting objects")), OPT_BOOL(0, "write-bitmap-index", &write_bitmap_index, N_("write a bitmap index together with the pack index")), + + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), + OPT_END(), }; @@ -3028,6 +3038,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (!rev_list_all || !rev_list_reflog || !rev_list_index) unpack_unreachable_expiration = 0; + if (filter_options.choice) { + if (!pack_to_stdout) + die("cannot use filtering with an indexable pack."); + use_bitmap_index = 0; + } + /* * "soft" reasons not to use bitmaps - for on-disk repack by default we want * diff --git a/builtin/rev-list.c b/builtin/rev-list.c index c1c74d4a795643..eeb999bd469f22 100644 --- a/builtin/rev-list.c +++ b/builtin/rev-list.c @@ -12,6 +12,7 @@ #include "bisect.h" #include "progress.h" #include "reflog-walk.h" +#include "partial-clone-utils.h" static const char rev_list_usage[] = "git rev-list [OPTION] ... [ -- paths... ]\n" @@ -54,6 +55,11 @@ static const char rev_list_usage[] = static struct progress *progress; static unsigned progress_counter; +static struct list_objects_filter_options filter_options; +static struct list_objects_filter_map missing_objects; +static int arg_print_missing; +static int arg_print_omitted; +#define DEFAULT_MAP_SIZE (16*1024) static void finish_commit(struct commit *commit, void *data); static void show_commit(struct commit *commit, void *data) @@ -181,8 +187,26 @@ static void finish_commit(struct commit *commit, void *data) static void finish_object(struct object *obj, const char *name, void *cb_data) { struct rev_list_info *info = cb_data; - if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) + if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) { + if (arg_print_missing) { + list_objects_filter_map_insert( + &missing_objects, &obj->oid, name, obj->type); + return; + } + + /* + * Relax consistency checks when we expect missing + * objects because of partial-clone or a previous + * partial-fetch. + * + * Note that this is independent of any filtering that + * we are doing in this run. 
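With `--filter-print-missing` and `--filter-print-omitted`, rev-list mixes ordinary object lines with lines prefixed by `?` (missing) and `~` (omitted), as documented in rev-list-options.txt above. Here is a small standalone consumer that splits that output back apart; only the one-character prefixes come from this patch, the tallying is invented for the example.

```c
#include <stdio.h>

/*
 * Read `git rev-list --objects --filter=... --filter-print-missing
 * --filter-print-omitted ...` output on stdin and count the three kinds
 * of lines: shown objects, missing objects ("?<oid> <type>") and
 * omitted objects ("~<oid> <type>").
 */
int main(void)
{
	char line[4096];
	unsigned long shown = 0, missing = 0, omitted = 0;

	while (fgets(line, sizeof(line), stdin)) {
		if (line[0] == '?')
			missing++;
		else if (line[0] == '~')
			omitted++;
		else
			shown++;
	}
	printf("shown %lu, missing %lu, omitted %lu\n", shown, missing, omitted);
	return 0;
}
```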
+ */ + if (is_partial_clone_registered()) + return; + die("missing blob object '%s'", oid_to_hex(&obj->oid)); + } if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT) parse_object(&obj->oid); } @@ -202,6 +226,22 @@ static void show_edge(struct commit *commit) printf("-%s\n", oid_to_hex(&commit->object.oid)); } +static void print_omitted_object(int i, int i_limit, struct list_objects_filter_map_entry *e, void *cb_data) +{ + /* struct rev_list_info *info = cb_data; */ + const char *tn = typename(e->type); + + printf("~%s %s\n", oid_to_hex(&e->entry.oid), tn); +} + +static void print_missing_object(int i, int i_limit, struct list_objects_filter_map_entry *e, void *cb_data) +{ + /* struct rev_list_info *info = cb_data; */ + const char *tn = typename(e->type); + + printf("?%s %s\n", oid_to_hex(&e->entry.oid), tn); +} + static void print_var_str(const char *var, const char *val) { printf("%s='%s'\n", var, val); @@ -335,6 +375,26 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) show_progress = arg; continue; } + + if (skip_prefix(arg, ("--" CL_ARG__FILTER "="), &arg)) { + parse_list_objects_filter(&filter_options, arg); + if (filter_options.choice && !revs.blob_objects) + die(_("object filtering requires --objects")); + if (filter_options.choice == LOFC_SPARSE_OID && + !filter_options.sparse_oid_value) + die(_("invalid sparse value '%s'"), + filter_options.raw_value); + continue; + } + if (!strcmp(arg, "--filter-print-missing")) { + arg_print_missing = 1; + continue; + } + if (!strcmp(arg, "--filter-print-omitted")) { + arg_print_omitted = 1; + continue; + } + usage(rev_list_usage); } @@ -360,6 +420,9 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) if (revs.show_notes) die(_("rev-list does not support display of notes")); + if (filter_options.choice && use_bitmap_index) + die(_("cannot combine --use-bitmap-index with object filtering")); + save_commit_buffer = (revs.verbose_header || revs.grep_filter.pattern_list || revs.grep_filter.header_list); @@ -404,7 +467,25 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) return show_bisect_vars(&info, reaches, all); } - traverse_commit_list(&revs, show_commit, show_object, &info); + if (arg_print_missing) { + memset(&missing_objects, 0, sizeof(missing_objects)); + list_objects_filter_map_init(&missing_objects, + DEFAULT_MAP_SIZE); + } + + if (filter_options.choice) + traverse_commit_list_filtered(&filter_options, &revs, + show_commit, show_object, + (arg_print_omitted ? 
print_omitted_object : NULL), + &info); + else + traverse_commit_list(&revs, show_commit, show_object, &info); + + if (arg_print_missing) { + list_objects_filter_map_foreach(&missing_objects, + print_missing_object, &info); + list_objects_filter_map_clear(&missing_objects); + } stop_progress(&progress); diff --git a/cache.h b/cache.h index 6440e2bf21f580..4b785c030ec40c 100644 --- a/cache.h +++ b/cache.h @@ -860,12 +860,16 @@ extern int grafts_replace_parents; #define GIT_REPO_VERSION 0 #define GIT_REPO_VERSION_READ 1 extern int repository_format_precious_objects; +extern char *repository_format_partial_clone_remote; +extern char *repository_format_partial_clone_filter; struct repository_format { int version; int precious_objects; int is_bare; char *work_tree; + char *partial_clone_remote; /* value of extensions.partialcloneremote */ + char *partial_clone_filter; /* value of extensions.partialclonefilter */ struct string_list unknown_extensions; }; diff --git a/config.h b/config.h index a49d2644162250..90544ef46c39a2 100644 --- a/config.h +++ b/config.h @@ -34,6 +34,9 @@ struct config_options { const char *git_dir; }; +#define KEY_PARTIALCLONEREMOTE "partialcloneremote" +#define KEY_PARTIALCLONEFILTER "partialclonefilter" + typedef int (*config_fn_t)(const char *, const char *, void *); extern int git_default_config(const char *, const char *, void *); extern int git_config_from_file(config_fn_t fn, const char *, void *); diff --git a/dir.c b/dir.c index 1d17b800cf374d..d848f2bfa29e48 100644 --- a/dir.c +++ b/dir.c @@ -739,6 +739,10 @@ static void invalidate_directory(struct untracked_cache *uc, dir->dirs[i]->recurse = 0; } +static int add_excludes_from_buffer(char *buf, size_t size, + const char *base, int baselen, + struct exclude_list *el); + /* * Given a file with name "fname", read it (either from disk, or from * an index if 'istate' is non-null), parse it and store the @@ -754,9 +758,9 @@ static int add_excludes(const char *fname, const char *base, int baselen, struct sha1_stat *sha1_stat) { struct stat st; - int fd, i, lineno = 1; + int fd; size_t size = 0; - char *buf, *entry; + char *buf; fd = open(fname, O_RDONLY); if (fd < 0 || fstat(fd, &st) < 0) { @@ -813,6 +817,17 @@ static int add_excludes(const char *fname, const char *base, int baselen, } } + add_excludes_from_buffer(buf, size, base, baselen, el); + return 0; +} + +static int add_excludes_from_buffer(char *buf, size_t size, + const char *base, int baselen, + struct exclude_list *el) +{ + int i, lineno = 1; + char *entry; + el->filebuf = buf; if (skip_utf8_bom(&buf, size)) @@ -841,6 +856,38 @@ int add_excludes_from_file_to_list(const char *fname, const char *base, return add_excludes(fname, base, baselen, el, istate, NULL); } +int add_excludes_from_blob_to_list( + struct object_id *oid, + const char *base, int baselen, + struct exclude_list *el) +{ + char *buf; + unsigned long size; + enum object_type type; + + buf = read_sha1_file(oid->hash, &type, &size); + if (!buf) + return -1; + + if (type != OBJ_BLOB) { + free(buf); + return -1; + } + + if (size == 0) { + free(buf); + return 0; + } + + if (buf[size - 1] != '\n') { + buf = xrealloc(buf, st_add(size, 1)); + buf[size++] = '\n'; + } + + add_excludes_from_buffer(buf, size, base, baselen, el); + return 0; +} + struct exclude_list *add_exclude_list(struct dir_struct *dir, int group_type, const char *src) { diff --git a/dir.h b/dir.h index e3717055d19336..1bcf39123ad7fd 100644 --- a/dir.h +++ b/dir.h @@ -256,6 +256,9 @@ extern struct exclude_list *add_exclude_list(struct 
dir_struct *dir, extern int add_excludes_from_file_to_list(const char *fname, const char *base, int baselen, struct exclude_list *el, struct index_state *istate); extern void add_excludes_from_file(struct dir_struct *, const char *fname); +extern int add_excludes_from_blob_to_list(struct object_id *oid, + const char *base, int baselen, + struct exclude_list *el); extern void parse_exclude_pattern(const char **string, int *patternlen, unsigned *flags, int *nowildcardlen); extern void add_exclude(const char *string, const char *base, int baselen, struct exclude_list *el, int srcpos); diff --git a/environment.c b/environment.c index 8289c25b44d74a..2fcf9bb3d4d90e 100644 --- a/environment.c +++ b/environment.c @@ -27,6 +27,8 @@ int warn_ambiguous_refs = 1; int warn_on_object_refname_ambiguity = 1; int ref_paranoia = -1; int repository_format_precious_objects; +char *repository_format_partial_clone_remote; +char *repository_format_partial_clone_filter; const char *git_commit_encoding; const char *git_log_output_encoding; const char *apply_default_whitespace; diff --git a/fetch-pack.c b/fetch-pack.c index 008b25d3db0872..d76f08c55f760d 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -377,6 +377,8 @@ static int find_common(struct fetch_pack_args *args, if (prefer_ofs_delta) strbuf_addstr(&c, " ofs-delta"); if (deepen_since_ok) strbuf_addstr(&c, " deepen-since"); if (deepen_not_ok) strbuf_addstr(&c, " deepen-not"); + if (args->filter_options.choice) + strbuf_addstr(&c, (" " CL_ARG__FILTER)); if (agent_supported) strbuf_addf(&c, " agent=%s", git_user_agent_sanitized()); packet_buf_write(&req_buf, "want %s%s\n", remote_hex, c.buf); @@ -407,6 +409,14 @@ static int find_common(struct fetch_pack_args *args, packet_buf_write(&req_buf, "deepen-not %s", s->string); } } + + /* + * TODO Do we need to quote raw_value? 
+ */ + if (args->filter_options.choice) + packet_buf_write(&req_buf, (CL_ARG__FILTER " %s"), + args->filter_options.raw_value); + packet_buf_flush(&req_buf); state_len = req_buf.len; @@ -850,6 +860,7 @@ static int get_pack(struct fetch_pack_args *args, "--keep=fetch-pack %"PRIuMAX " on %s", (uintmax_t)getpid(), hostname); } + if (args->check_self_contained_and_connected) argv_array_push(&cmd.args, "--check-self-contained-and-connected"); } @@ -963,6 +974,11 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args, else prefer_ofs_delta = 0; + if (server_supports(CL_ARG__FILTER)) + print_verbose(args, _("Server supports " CL_ARG__FILTER)); + else if (args->filter_options.choice) + die("Server does not support %s", CL_ARG__FILTER); + if ((agent_feature = server_feature_value("agent", &agent_len))) { agent_supported = 1; if (agent_len) diff --git a/fetch-pack.h b/fetch-pack.h index b6aeb43a8e2143..72690653489eac 100644 --- a/fetch-pack.h +++ b/fetch-pack.h @@ -3,6 +3,7 @@ #include "string-list.h" #include "run-command.h" +#include "list-objects-filter-options.h" struct oid_array; @@ -12,6 +13,7 @@ struct fetch_pack_args { int depth; const char *deepen_since; const struct string_list *deepen_not; + struct list_objects_filter_options filter_options; unsigned deepen_relative:1; unsigned quiet:1; unsigned keep_pack:1; diff --git a/list-objects-filter-blobs-limit.c b/list-objects-filter-blobs-limit.c new file mode 100644 index 00000000000000..2c3f8f1c000eba --- /dev/null +++ b/list-objects-filter-blobs-limit.c @@ -0,0 +1,147 @@ +#include "cache.h" +#include "dir.h" +#include "tag.h" +#include "commit.h" +#include "tree.h" +#include "blob.h" +#include "diff.h" +#include "tree-walk.h" +#include "revision.h" +#include "list-objects.h" +#include "list-objects-filter-blobs-limit.h" + +#define DEFAULT_MAP_SIZE (16*1024) + +/* + * A filter for list-objects to omit large blobs, + * but always include ".git*" special files. + * And to OPTIONALLY collect a list of the omitted OIDs. + */ +struct filter_blobs_limit_data { + struct list_objects_filter_map *omits; + unsigned long max_bytes; +}; + +static list_objects_filter_result filter_blobs_limit( + list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data_) +{ + struct filter_blobs_limit_data *filter_data = filter_data_; + unsigned long object_length; + enum object_type t; + int is_special_filename; + + switch (filter_type) { + default: + die("unkown filter_type"); + return LOFR_ZERO; + + case LOFT_BEGIN_TREE: + assert(obj->type == OBJ_TREE); + /* always include all tree objects */ + return LOFR_MARK_SEEN | LOFR_SHOW; + + case LOFT_END_TREE: + assert(obj->type == OBJ_TREE); + return LOFR_ZERO; + + case LOFT_BLOB: + assert(obj->type == OBJ_BLOB); + assert((obj->flags & SEEN) == 0); + + is_special_filename = ((strncmp(filename, ".git", 4) == 0) && + filename[4]); + + /* + * If we are keeping a list of the omitted objects + * for the caller *AND* we previously "provisionally" + * omitted this object (because of size) *AND* it now + * has a special filename, make it not-omitted. + * Otherwise, continue to provisionally omit it. + */ + if (filter_data->omits && + list_objects_filter_map_contains(filter_data->omits, + &obj->oid)) { + if (!is_special_filename) + return LOFR_ZERO; + + list_objects_filter_map_remove(filter_data->omits, + &obj->oid); + return LOFR_MARK_SEEN | LOFR_SHOW; + } + + /* + * If filename matches ".git*", always include it (regardless + * of size). 
(This may include blobs that we do not have + * locally.) + */ + if (is_special_filename) + return LOFR_MARK_SEEN | LOFR_SHOW; + + t = sha1_object_info(obj->oid.hash, &object_length); + if (t != OBJ_BLOB) { /* probably OBJ_NONE */ + /* + * We DO NOT have the blob locally, so we cannot + * apply the size filter criteria. Be conservative + * and force show it (and let the caller deal with + * the ambiguity). (This matches the behavior above + * when the special filename matches.) + */ + return LOFR_MARK_SEEN | LOFR_SHOW; + } + + if (object_length < filter_data->max_bytes) + return LOFR_MARK_SEEN | LOFR_SHOW; + + /* + * Provisionally omit it. We've already established + * that this blob is too big and doesn't have a special + * filename, so we *WANT* to omit it. However, there + * may be a special file elsewhere in the tree that + * references this same blob, so we cannot reject it + * just yet. Leave the LOFR_ bits unset so that *IF* + * the blob appears again in the traversal, we will + * be asked again. + * + * If we are keeping a list of the ommitted objects, + * provisionally add it to the list. + */ + + if (filter_data->omits) + list_objects_filter_map_insert(filter_data->omits, + &obj->oid, pathname, + obj->type); + + return LOFR_ZERO; + } +} + +void traverse_commit_list__blobs_limit( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data, + unsigned long large_byte_limit) +{ + struct filter_blobs_limit_data d; + + memset(&d, 0, sizeof(d)); + if (print_omitted_object) { + d.omits = xcalloc(1, sizeof(*d.omits)); + list_objects_filter_map_init(d.omits, DEFAULT_MAP_SIZE); + } + d.max_bytes = large_byte_limit; + + traverse_commit_list_worker(revs, show_commit, show_object, ctx_data, + filter_blobs_limit, &d); + + if (print_omitted_object) { + list_objects_filter_map_foreach(d.omits, print_omitted_object, + ctx_data); + list_objects_filter_map_clear(d.omits); + } +} diff --git a/list-objects-filter-blobs-limit.h b/list-objects-filter-blobs-limit.h new file mode 100644 index 00000000000000..ea0508881d7ca0 --- /dev/null +++ b/list-objects-filter-blobs-limit.h @@ -0,0 +1,18 @@ +#ifndef LIST_OBJECTS_FILTER_BLOBS_LIMIT_H +#define LIST_OBJECTS_FILTER_BLOBS_LIMIT_H + +#include "list-objects-filter-map.h" + +/* + * A filter for list-objects to omit large blobs, + * but always include ".git*" special files. + */ +void traverse_commit_list__blobs_limit( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data, + unsigned long large_byte_limit); + +#endif /* LIST_OBJECTS_FILTER_BLOBS_LIMIT_H */ diff --git a/list-objects-filter-blobs-none.c b/list-objects-filter-blobs-none.c new file mode 100644 index 00000000000000..38c4b25c3d3262 --- /dev/null +++ b/list-objects-filter-blobs-none.c @@ -0,0 +1,83 @@ +#include "cache.h" +#include "dir.h" +#include "tag.h" +#include "commit.h" +#include "tree.h" +#include "blob.h" +#include "diff.h" +#include "tree-walk.h" +#include "revision.h" +#include "list-objects.h" +#include "list-objects-filter-blobs-none.h" + +#define DEFAULT_MAP_SIZE (16*1024) + +/* + * A filter for list-objects to omit ALL blobs from the traversal. + * And to OPTIONALLY collect a list of the omitted OIDs. 
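The size-limit filter above cannot reject an oversized blob outright the first time it sees it, because the same blob may reappear later under a `.git*` name and must then be included after all; hence the "provisionally omit, promote on a special filename" dance. A toy standalone model of that behavior follows — the object ids, file names, sizes, and the flat array standing in for the omits map are all invented.

```c
#include <stdio.h>
#include <string.h>

#define LIMIT 1024	/* pretend --filter=blob:limit=1k */

struct entry { const char *oid; const char *name; unsigned long size; };

static const char *omits[16];
static int nr_omits;

static int omitted(const char *oid)
{
	int i;
	for (i = 0; i < nr_omits; i++)
		if (!strcmp(omits[i], oid))
			return i;
	return -1;
}

int main(void)
{
	/* One big blob reachable under two names: data and a .gitattributes copy. */
	static const struct entry walk[] = {
		{ "aaaa", "big.bin",        4096 },
		{ "bbbb", "small.txt",        10 },
		{ "aaaa", ".gitattributes", 4096 },	/* same blob, special name */
	};
	int i, k;

	for (i = 0; i < 3; i++) {
		const struct entry *e = &walk[i];
		int special = !strncmp(e->name, ".git", 4) && e->name[4];

		if (special || e->size < LIMIT) {
			if ((k = omitted(e->oid)) >= 0)	/* promote an earlier omit */
				omits[k] = omits[--nr_omits];
			printf("include %s (%s)\n", e->oid, e->name);
		} else if (omitted(e->oid) < 0) {
			omits[nr_omits++] = e->oid;	/* provisional omit */
			printf("omit?   %s (%s)\n", e->oid, e->name);
		}
	}
	for (i = 0; i < nr_omits; i++)
		printf("omitted %s\n", omits[i]);
	return 0;
}
```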
+ */ +struct filter_blobs_none_data { + struct list_objects_filter_map *omits; +}; + +static list_objects_filter_result filter_blobs_none( + list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data_) +{ + struct filter_blobs_none_data *filter_data = filter_data_; + + switch (filter_type) { + default: + die("unkown filter_type"); + return LOFR_ZERO; + + case LOFT_BEGIN_TREE: + assert(obj->type == OBJ_TREE); + /* always include all tree objects */ + return LOFR_MARK_SEEN | LOFR_SHOW; + + case LOFT_END_TREE: + assert(obj->type == OBJ_TREE); + return LOFR_ZERO; + + case LOFT_BLOB: + assert(obj->type == OBJ_BLOB); + assert((obj->flags & SEEN) == 0); + + if (filter_data->omits) + list_objects_filter_map_insert( + filter_data->omits, &obj->oid, pathname, + obj->type); + + return LOFR_MARK_SEEN; /* but not LOFR_SHOW (hard omit) */ + } +} + +void traverse_commit_list__blobs_none( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data) +{ + struct filter_blobs_none_data d; + + memset(&d, 0, sizeof(d)); + if (print_omitted_object) { + d.omits = xcalloc(1, sizeof(*d.omits)); + list_objects_filter_map_init(d.omits, DEFAULT_MAP_SIZE); + } + + traverse_commit_list_worker(revs, show_commit, show_object, ctx_data, + filter_blobs_none, &d); + + if (print_omitted_object) { + list_objects_filter_map_foreach(d.omits, + print_omitted_object, + ctx_data); + list_objects_filter_map_clear(d.omits); + } +} diff --git a/list-objects-filter-blobs-none.h b/list-objects-filter-blobs-none.h new file mode 100644 index 00000000000000..363c9de61696c3 --- /dev/null +++ b/list-objects-filter-blobs-none.h @@ -0,0 +1,18 @@ +#ifndef LIST_OBJECTS_FILTER_BLOBS_NONE_H +#define LIST_OBJECTS_FILTER_BLOBS_NONE_H + +#include "list-objects-filter-map.h" + +/* + * A filter for list-objects to omit ALL blobs + * from the traversal. 
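Both filters talk to the traversal only through the `LOFR_*` result bits that this series adds to list-objects.h: `LOFR_MARK_SEEN | LOFR_SHOW` is a normal include, `LOFR_MARK_SEEN` alone is a hard omit (never shown, never revisited), and `LOFR_ZERO` is a provisional omit that leaves the object open for another look. A minimal standalone rendering of how the caller interprets those bits, with locally re-declared stand-ins for the patch's enums and a made-up object list:

```c
#include <stdio.h>

/* Local stand-ins that mirror the values this patch adds to list-objects.h. */
enum result { LOFR_ZERO = 0, LOFR_MARK_SEEN = 1 << 0, LOFR_SHOW = 1 << 1 };
#define SEEN (1u << 0)

struct obj {
	const char *name;
	enum result decision;	/* what a filter callback returned for it */
	unsigned flags;
};

/* The caller-side interpretation, as in process_blob()/process_tree(). */
static void apply(struct obj *o)
{
	if (o->flags & SEEN)
		return;			/* already settled on an earlier visit */
	if (o->decision & LOFR_MARK_SEEN)
		o->flags |= SEEN;
	if (o->decision & LOFR_SHOW)
		printf("show %s\n", o->name);
}

int main(void)
{
	struct obj objs[] = {
		{ "README",   LOFR_MARK_SEEN | LOFR_SHOW, 0 },	/* include */
		{ "huge.bin", LOFR_MARK_SEEN,             0 },	/* hard omit */
		{ "maybe",    LOFR_ZERO,                  0 },	/* provisional omit */
	};
	int i;

	for (i = 0; i < 3; i++)
		apply(&objs[i]);
	for (i = 0; i < 3; i++)
		printf("%-8s seen=%d\n", objs[i].name, !!(objs[i].flags & SEEN));
	return 0;
}
```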
+ */ +void traverse_commit_list__blobs_none( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data); + +#endif /* LIST_OBJECTS_FILTER_BLOBS_NONE_H */ + diff --git a/list-objects-filter-map.c b/list-objects-filter-map.c new file mode 100644 index 00000000000000..3a9335a316066b --- /dev/null +++ b/list-objects-filter-map.c @@ -0,0 +1,113 @@ +#include "cache.h" +#include "list-objects-filter-map.h" + +void list_objects_filter_map_init( + struct list_objects_filter_map *map, size_t initial_size) +{ + oidmap_init(&map->map, initial_size); +} + +struct list_objects_filter_map_entry *list_objects_filter_map_get( + const struct list_objects_filter_map *map, + const struct object_id *oid) +{ + struct list_objects_filter_map_entry *e = oidmap_get(&map->map, oid); + + return e; +} + +int list_objects_filter_map_contains(const struct list_objects_filter_map *map, + const struct object_id *oid) +{ + return !!list_objects_filter_map_get(map, oid); +} + +int list_objects_filter_map_insert(struct list_objects_filter_map *map, + const struct object_id *oid, + const char *pathname, enum object_type type) +{ + struct list_objects_filter_map_entry *e; + void *old; + + if (list_objects_filter_map_contains(map, oid)) + return 1; + + e = xcalloc(1, sizeof(*e)); + oidcpy(&e->entry.oid, oid); + if (pathname && *pathname) + e->pathname = strdup(pathname); + e->type = type; + + old = oidmap_put(&map->map, e); + assert(!old); /* since we already confirmed !contained */ + + return 0; +} + +static inline void lofme_free(struct list_objects_filter_map_entry *e) +{ + if (!e) + return; + if (e->pathname) + free(e->pathname); + free(e); +} + +void list_objects_filter_map_remove(struct list_objects_filter_map *map, + const struct object_id *oid) +{ + struct list_objects_filter_map_entry *e; + + e = oidmap_remove(&map->map, oid); + lofme_free(e); +} + +void list_objects_filter_map_clear(struct list_objects_filter_map *map) +{ + struct hashmap_iter iter; + struct list_objects_filter_map_entry *e; + + hashmap_iter_init(&map->map.map, &iter); + while ((e = hashmap_iter_next(&iter))) + lofme_free(e); + + oidmap_free(&map->map, 0); +} + +static int my_cmp(const void *a, const void *b) +{ + const struct oidmap_entry *ea, *eb; + + ea = *(const struct oidmap_entry **)a; + eb = *(const struct oidmap_entry **)b; + + return oidcmp(&ea->oid, &eb->oid); +} + +void list_objects_filter_map_foreach(struct list_objects_filter_map *map, + list_objects_filter_map_foreach_cb cb, + void *cb_data) +{ + struct hashmap_iter iter; + struct list_objects_filter_map_entry **array; + struct list_objects_filter_map_entry *e; + int k, nr; + + nr = hashmap_get_size(&map->map.map); + if (!nr) + return; + + array = xcalloc(nr, sizeof(*e)); + + k = 0; + hashmap_iter_init(&map->map.map, &iter); + while ((e = hashmap_iter_next(&iter))) + array[k++] = e; + + QSORT(array, nr, my_cmp); + + for (k = 0; k < nr; k++) + cb(k, nr, array[k], cb_data); + + free(array); +} diff --git a/list-objects-filter-map.h b/list-objects-filter-map.h new file mode 100644 index 00000000000000..080c0de41737b3 --- /dev/null +++ b/list-objects-filter-map.h @@ -0,0 +1,50 @@ +#ifndef LIST_OBJECTS_FILTER_MAP_H +#define LIST_OBJECTS_FILTER_MAP_H + +#include "oidmap.h" + +struct list_objects_filter_map { + struct oidmap map; +}; + +#define LIST_OBJECTS_FILTER_MAP_INIT { { NULL } } + +struct list_objects_filter_map_entry { + struct oidmap_entry entry; /* must be first */ + + char 
*pathname; + enum object_type type; +}; + +extern void list_objects_filter_map_init( + struct list_objects_filter_map *map, size_t initial_size); + +extern struct list_objects_filter_map_entry *list_objects_filter_map_get( + const struct list_objects_filter_map *map, + const struct object_id *oid); + +extern int list_objects_filter_map_contains( + const struct list_objects_filter_map *map, + const struct object_id *oid); + +extern int list_objects_filter_map_insert( + struct list_objects_filter_map *map, + const struct object_id *oid, + const char *pathname, enum object_type type); + +extern void list_objects_filter_map_remove( + struct list_objects_filter_map *map, + const struct object_id *oid); + +extern void list_objects_filter_map_clear(struct list_objects_filter_map *map); + +typedef void (*list_objects_filter_map_foreach_cb)( + int i, int i_limit, + struct list_objects_filter_map_entry *e, void *cb_data); + +extern void list_objects_filter_map_foreach( + struct list_objects_filter_map *map, + list_objects_filter_map_foreach_cb cb, + void *cb_data); + +#endif /* LIST_OBJECTS_FILTER_MAP_H */ diff --git a/list-objects-filter-options.c b/list-objects-filter-options.c new file mode 100644 index 00000000000000..40f48ac275cabf --- /dev/null +++ b/list-objects-filter-options.c @@ -0,0 +1,101 @@ +#include "cache.h" +#include "commit.h" +#include "config.h" +#include "revision.h" +#include "list-objects.h" +#include "list-objects-filter-options.h" + +/* + * Parse value of the argument to the "filter" keword. + * On the command line this looks like: --filter= + * and in the pack protocol as: filter + * + * ::= blob:none + * blob:limit:[kmg] + * sparse:oid: + * sparse:path: + */ +int parse_list_objects_filter(struct list_objects_filter_options *filter_options, + const char *arg) +{ + struct object_context oc; + struct object_id sparse_oid; + const char *v0; + const char *v1; + + if (filter_options->choice) + die(_("multiple object filter types cannot be combined")); + + /* + * TODO consider rejecting 'arg' if it contains any + * TODO injection characters (since we might send this + * TODO to a sub-command or to the server and we don't + * TODO want to deal with legacy quoting/escaping for + * TODO a new feature). + */ + + filter_options->raw_value = strdup(arg); + + if (skip_prefix(arg, "blob:", &v0) || skip_prefix(arg, "blobs:", &v0)) { + if (!strcmp(v0, "none")) { + filter_options->choice = LOFC_BLOB_NONE; + return 0; + } + + if (skip_prefix(v0, "limit=", &v1) && + git_parse_ulong(v1, &filter_options->blob_limit_value)) { + filter_options->choice = LOFC_BLOB_LIMIT; + return 0; + } + } + else if (skip_prefix(arg, "sparse:", &v0)) { + if (skip_prefix(v0, "oid=", &v1)) { + filter_options->choice = LOFC_SPARSE_OID; + if (!get_oid_with_context(v1, GET_OID_BLOB, + &sparse_oid, &oc)) { + /* + * We successfully converted the + * into an actual OID. Rewrite the raw_value + * in canonoical form with just the OID. + * (If we send this request to the server, we + * want an absolute expression rather than a + * local-ref-relative expression.) + */ + free((char *)filter_options->raw_value); + filter_options->raw_value = + xstrfmt("sparse:oid=%s", + oid_to_hex(&sparse_oid)); + filter_options->sparse_oid_value = + oiddup(&sparse_oid); + } else { + /* + * We could not turn the into an + * OID. Leave the raw_value as is in case + * the server can parse it. (It may refer to + * a branch, commit, or blob we don't have.) 
+ */ + } + return 0; + } + + if (skip_prefix(v0, "path=", &v1)) { + filter_options->choice = LOFC_SPARSE_PATH; + filter_options->sparse_path_value = strdup(v1); + return 0; + } + } + + die(_("invalid filter expression '%s'"), arg); + return 0; +} + +int opt_parse_list_objects_filter(const struct option *opt, + const char *arg, int unset) +{ + struct list_objects_filter_options *filter_options = opt->value; + + assert(arg); + assert(!unset); + + return parse_list_objects_filter(filter_options, arg); +} diff --git a/list-objects-filter-options.h b/list-objects-filter-options.h new file mode 100644 index 00000000000000..23bd68ee7aa699 --- /dev/null +++ b/list-objects-filter-options.h @@ -0,0 +1,50 @@ +#ifndef LIST_OBJECTS_FILTER_OPTIONS_H +#define LIST_OBJECTS_FILTER_OPTIONS_H + +#include "parse-options.h" + +/* + * Common declarations and utilities for filtering objects (such as omitting + * large blobs) in list_objects:traverse_commit_list() and git-rev-list. + */ + +enum list_objects_filter_choice { + LOFC_DISABLED = 0, + LOFC_BLOB_NONE, + LOFC_BLOB_LIMIT, + LOFC_SPARSE_OID, + LOFC_SPARSE_PATH, +}; + +struct list_objects_filter_options { + /* + * The raw argument value given on the command line or + * protocol request. (The part after the "--keyword=".) + */ + char *raw_value; + + /* + * Parsed values. Only 1 will be set depending on the flags below. + */ + struct object_id *sparse_oid_value; + char *sparse_path_value; + unsigned long blob_limit_value; + + enum list_objects_filter_choice choice; +}; + +/* Normalized command line arguments */ +#define CL_ARG__FILTER "filter" + +int parse_list_objects_filter(struct list_objects_filter_options *filter_options, + const char *arg); + +int opt_parse_list_objects_filter(const struct option *opt, + const char *arg, int unset); + +#define OPT_PARSE_LIST_OBJECTS_FILTER(fo) \ + { OPTION_CALLBACK, 0, CL_ARG__FILTER, fo, N_("args"), \ + N_("object filtering"), PARSE_OPT_NONEG, \ + opt_parse_list_objects_filter } + +#endif /* LIST_OBJECTS_FILTER_OPTIONS_H */ diff --git a/list-objects-filter-sparse.c b/list-objects-filter-sparse.c new file mode 100644 index 00000000000000..c773d940ed8ef6 --- /dev/null +++ b/list-objects-filter-sparse.c @@ -0,0 +1,239 @@ +#include "cache.h" +#include "dir.h" +#include "tag.h" +#include "commit.h" +#include "tree.h" +#include "blob.h" +#include "diff.h" +#include "tree-walk.h" +#include "revision.h" +#include "list-objects.h" +#include "list-objects-filter-sparse.h" + +#define DEFAULT_MAP_SIZE (16*1024) + +/* + * A filter driven by a sparse-checkout specification to only + * include blobs that a sparse checkout would populate. + * + * The sparse-checkout spec can be loaded from a blob with the + * given OID or from a local pathname. We allow an OID because + * the repo may be bare or we may be doing the filtering on the + * server. 
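parse_list_objects_filter() accepts four spellings of the filter spec: `blob:none`, `blob:limit=<n>[kmg]`, `sparse:oid=<blob-ish>` and `sparse:path=<path>` (the angle-bracketed parts are placeholders). Below is a reduced standalone parser for the same grammar; it skips the `blobs:` alias, the overflow handling of git_parse_ulong(), and the OID resolution and canonicalization done with get_oid_with_context(), and the helper names are mine.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum choice { NONE, BLOB_NONE, BLOB_LIMIT, SPARSE_OID, SPARSE_PATH };

struct filter {
	enum choice choice;
	unsigned long blob_limit;
	const char *value;	/* oid expression or path, when applicable */
};

/* Like skip_prefix(): advance *s past prefix, or return 0 on mismatch. */
static int skip(const char **s, const char *prefix)
{
	size_t n = strlen(prefix);
	if (strncmp(*s, prefix, n))
		return 0;
	*s += n;
	return 1;
}

static int parse_filter(const char *arg, struct filter *f)
{
	if (!strcmp(arg, "blob:none")) {
		f->choice = BLOB_NONE;
		return 0;
	}
	if (skip(&arg, "blob:limit=")) {
		char *end;
		unsigned long v = strtoul(arg, &end, 10);
		switch (*end) {		/* optional scale suffix */
		case 'k': v <<= 10; end++; break;
		case 'm': v <<= 20; end++; break;
		case 'g': v <<= 30; end++; break;
		}
		if (*end)
			return -1;
		f->choice = BLOB_LIMIT;
		f->blob_limit = v;
		return 0;
	}
	if (skip(&arg, "sparse:oid=")) {
		f->choice = SPARSE_OID;
		f->value = arg;
		return 0;
	}
	if (skip(&arg, "sparse:path=")) {
		f->choice = SPARSE_PATH;
		f->value = arg;
		return 0;
	}
	return -1;	/* the real code die()s here */
}

int main(int argc, char **argv)
{
	struct filter f = { NONE, 0, NULL };
	const char *arg = argc > 1 ? argv[1] : "blob:limit=512k";

	if (parse_filter(arg, &f)) {
		fprintf(stderr, "invalid filter expression '%s'\n", arg);
		return 1;
	}
	printf("choice=%d limit=%lu value=%s\n", f.choice, f.blob_limit,
	       f.value ? f.value : "-");
	return 0;
}
```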
+ */ +struct frame { + int defval; + int child_prov_omit : 1; +}; + +struct filter_use_sparse_data { + struct list_objects_filter_map *omits; + struct exclude_list el; + + size_t nr, alloc; + struct frame *array_frame; +}; + +static list_objects_filter_result filter_use_sparse( + list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data_) +{ + struct filter_use_sparse_data *filter_data = filter_data_; + struct list_objects_filter_map_entry *entry_prev = NULL; + int val, dtype; + struct frame *frame; + + switch (filter_type) { + default: + die("unkown filter_type"); + return LOFR_ZERO; + + case LOFT_BEGIN_TREE: + assert(obj->type == OBJ_TREE); + dtype = DT_DIR; + val = is_excluded_from_list(pathname, strlen(pathname), + filename, &dtype, &filter_data->el, + &the_index); + if (val < 0) + val = filter_data->array_frame[filter_data->nr].defval; + + ALLOC_GROW(filter_data->array_frame, filter_data->nr + 1, + filter_data->alloc); + filter_data->nr++; + filter_data->array_frame[filter_data->nr].defval = val; + filter_data->array_frame[filter_data->nr].child_prov_omit = 0; + + /* + * A directory with this tree OID may appear in multiple + * places in the tree. (Think of a directory move, with + * no other changes.) And with a different pathname, the + * is_excluded...() results for this directory and items + * contained within it may be different. So we cannot + * mark it SEEN (yet), since that will prevent process_tree() + * from revisiting this tree object with other pathnames. + * + * Only SHOW the tree object the first time we visit this + * tree object. + * + * We always show all tree objects. A future optimization + * may want to attempt to narrow this. + */ + if (obj->flags & FILTER_REVISIT) + return LOFR_ZERO; + obj->flags |= FILTER_REVISIT; + return LOFR_SHOW; + + case LOFT_END_TREE: + assert(obj->type == OBJ_TREE); + assert(filter_data->nr > 0); + + frame = &filter_data->array_frame[filter_data->nr]; + filter_data->nr--; + + /* + * Tell our parent directory if any of our children were + * provisionally omitted. + */ + filter_data->array_frame[filter_data->nr].child_prov_omit |= + frame->child_prov_omit; + + /* + * If there are NO provisionally omitted child objects (ALL child + * objects in this folder were INCLUDED), then we can mark the + * folder as SEEN (so we will not have to revisit it again). + */ + if (!frame->child_prov_omit) + return LOFR_MARK_SEEN; + return LOFR_ZERO; + + case LOFT_BLOB: + assert(obj->type == OBJ_BLOB); + assert((obj->flags & SEEN) == 0); + + frame = &filter_data->array_frame[filter_data->nr]; + + /* + * If we are keeping a list of the omitted objects + * for the caller *AND* we previsously provisionally + * omitted this object (because the THEN pathname + * is excluded) *AND* it has the same pathname, we + * can avoid duplicating the is_excluded lookup + * costs and continue provisionally omitting it. + */ + if (filter_data->omits) { + entry_prev = list_objects_filter_map_get( + filter_data->omits, &obj->oid); + if (entry_prev && + !strcmp(pathname, entry_prev->pathname)) { + frame->child_prov_omit = 1; + return LOFR_ZERO; + } + } + + dtype = DT_REG; + val = is_excluded_from_list(pathname, strlen(pathname), + filename, &dtype, &filter_data->el, + &the_index); + if (val < 0) + val = frame->defval; + if (val > 0) { + if (entry_prev) + list_objects_filter_map_remove( + filter_data->omits, &obj->oid); + return LOFR_MARK_SEEN | LOFR_SHOW; + } + + /* + * Provisionally omit it. 
We've already established that + * this pathname is not in the sparse-checkout specification + * with the CURRENT pathname, so we *WANT* to omit this blob. + * + * However, a pathname elsewhere in the tree may also + * reference this same blob, so we cannot reject it yet. + * Leave the LOFR_ bits unset so that if the blob appears + * again in the traversal, we will be asked again. + * + * The pathname that we associate with this omit is just + * the first one we saw for this blob. Other instances of + * this blob may have other pathnames and that is fine. + * We just use it for perf to do the entry_prev lookup + * above (because most of the time, the blob will be in + * the same place as we walk the commits). + */ + if (filter_data->omits) + list_objects_filter_map_insert(filter_data->omits, + &obj->oid, pathname, + obj->type); + + frame->child_prov_omit = 1; + return LOFR_ZERO; + } +} + +static void do_sparse( + struct filter_use_sparse_data *d, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data) +{ + ALLOC_GROW(d->array_frame, d->nr + 1, d->alloc); + d->array_frame[d->nr].defval = 0; /* default to include */ + d->array_frame[d->nr].child_prov_omit = 0; + + traverse_commit_list_worker(revs, show_commit, show_object, ctx_data, + filter_use_sparse, d); + + if (print_omitted_object) { + list_objects_filter_map_foreach(d->omits, print_omitted_object, ctx_data); + list_objects_filter_map_clear(d->omits); + } +} + +void traverse_commit_list__sparse_oid( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data, + struct object_id *oid) +{ + struct filter_use_sparse_data d; + + memset(&d, 0, sizeof(d)); + if (print_omitted_object) { + d.omits = xcalloc(1, sizeof(*d.omits)); + list_objects_filter_map_init(d.omits, DEFAULT_MAP_SIZE); + } + if (add_excludes_from_blob_to_list(oid, NULL, 0, &d.el) < 0) + die("could not load filter specification"); + + do_sparse(&d, revs, show_commit, show_object, print_omitted_object, + ctx_data); +} + +void traverse_commit_list__sparse_path( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data, + const char *path) +{ + struct filter_use_sparse_data d; + + memset(&d, 0, sizeof(d)); + if (print_omitted_object) { + d.omits = xcalloc(1, sizeof(*d.omits)); + list_objects_filter_map_init(d.omits, DEFAULT_MAP_SIZE); + } + if (add_excludes_from_file_to_list(path, NULL, 0, &d.el, NULL) < 0) + die("could not load filter specification"); + + do_sparse(&d, revs, show_commit, show_object, print_omitted_object, + ctx_data); +} diff --git a/list-objects-filter-sparse.h b/list-objects-filter-sparse.h new file mode 100644 index 00000000000000..6c715bf6e2197d --- /dev/null +++ b/list-objects-filter-sparse.h @@ -0,0 +1,30 @@ +#ifndef LIST_OBJECTS_FILTERS_SPARSE_H +#define LIST_OBJECTS_FILTERS_SPARSE_H + +#include "list-objects-filter-map.h" + +/* + * A filter driven by a sparse-checkout specification to only + * include blobs that a sparse checkout would populate. + * + * The sparse-checkout spec can be loaded from a blob with the + * given OID, a blob with a blob-ish path, or from a local pathname. + * We allow an OID because the repo may be bare or we may be doing + * the filtering on the server. 
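The sparse filter keeps one frame per tree on a stack: a frame starts with the include/exclude default inherited from its parent, and records whether anything below it was only provisionally omitted; a tree may be marked SEEN (and never revisited) only when none of its children were provisionally omitted. A standalone miniature of that bookkeeping, driven by a scripted walk instead of real tree objects — the paths, the match answers, and the fixed-size stack are all invented, and trees are simply reported as "show" on every visit.

```c
#include <stdio.h>

struct frame { int defval; int child_prov_omit; };

enum ev_type { BEGIN_TREE, END_TREE, BLOB };
struct ev {
	enum ev_type type;
	const char *path;
	int match;	/* 1 = sparse spec selects it, 0 = not selected, -1 = no rule */
};

int main(void)
{
	/* Scripted walk: root/{keep/a.txt, drop/b.txt} with only "keep/" selected. */
	static const struct ev walk[] = {
		{ BEGIN_TREE, "keep", 1 }, { BLOB, "keep/a.txt", -1 }, { END_TREE, "keep", 0 },
		{ BEGIN_TREE, "drop", 0 }, { BLOB, "drop/b.txt", -1 }, { END_TREE, "drop", 0 },
	};
	struct frame stack[16] = { { 0, 0 } };	/* frame 0: default = exclude */
	int top = 0, i;

	for (i = 0; i < (int)(sizeof(walk) / sizeof(walk[0])); i++) {
		const struct ev *e = &walk[i];
		int val = e->match < 0 ? stack[top].defval : e->match;

		switch (e->type) {
		case BEGIN_TREE:
			top++;
			stack[top].defval = val;	/* children inherit this */
			stack[top].child_prov_omit = 0;
			printf("tree  %-12s show\n", e->path);
			break;
		case BLOB:
			if (val > 0) {
				printf("blob  %-12s show\n", e->path);
			} else {
				printf("blob  %-12s provisionally omit\n", e->path);
				stack[top].child_prov_omit = 1;
			}
			break;
		case END_TREE:
			stack[top - 1].child_prov_omit |= stack[top].child_prov_omit;
			printf("tree  %-12s %s\n", e->path,
			       stack[top].child_prov_omit ? "may revisit" : "mark SEEN");
			top--;
			break;
		}
	}
	return 0;
}
```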
+ */ +void traverse_commit_list__sparse_oid( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data, + struct object_id *oid); +void traverse_commit_list__sparse_path( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *ctx_data, + const char *path); + +#endif /* LIST_OBJECTS_FILTERS_SPARSE_H */ diff --git a/list-objects.c b/list-objects.c index b3931fa434dc99..4ce25939c528d7 100644 --- a/list-objects.c +++ b/list-objects.c @@ -7,16 +7,22 @@ #include "tree-walk.h" #include "revision.h" #include "list-objects.h" +#include "list-objects-filter-blobs-none.h" +#include "list-objects-filter-blobs-limit.h" +#include "list-objects-filter-sparse.h" static void process_blob(struct rev_info *revs, struct blob *blob, show_object_fn show, struct strbuf *path, const char *name, - void *cb_data) + void *cb_data, + filter_object_fn filter, + void *filter_data) { struct object *obj = &blob->object; size_t pathlen; + list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->blob_objects) return; @@ -24,11 +30,15 @@ static void process_blob(struct rev_info *revs, die("bad blob object"); if (obj->flags & (UNINTERESTING | SEEN)) return; - obj->flags |= SEEN; pathlen = path->len; strbuf_addstr(path, name); - show(obj, path->buf, cb_data); + if (filter) + r = filter(LOFT_BLOB, obj, path->buf, &path->buf[pathlen], filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, path->buf, cb_data); strbuf_setlen(path, pathlen); } @@ -69,7 +79,9 @@ static void process_tree(struct rev_info *revs, show_object_fn show, struct strbuf *base, const char *name, - void *cb_data) + void *cb_data, + filter_object_fn filter, + void *filter_data) { struct object *obj = &tree->object; struct tree_desc desc; @@ -77,6 +89,7 @@ static void process_tree(struct rev_info *revs, enum interesting match = revs->diffopt.pathspec.nr == 0 ? 
all_entries_interesting: entry_not_interesting; int baselen = base->len; + list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->tree_objects) return; @@ -90,9 +103,13 @@ static void process_tree(struct rev_info *revs, die("bad tree object %s", oid_to_hex(&obj->oid)); } - obj->flags |= SEEN; strbuf_addstr(base, name); - show(obj, base->buf, cb_data); + if (filter) + r = filter(LOFT_BEGIN_TREE, obj, base->buf, &base->buf[baselen], filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, base->buf, cb_data); if (base->len) strbuf_addch(base, '/'); @@ -112,7 +129,7 @@ static void process_tree(struct rev_info *revs, process_tree(revs, lookup_tree(entry.oid), show, base, entry.path, - cb_data); + cb_data, filter, filter_data); else if (S_ISGITLINK(entry.mode)) process_gitlink(revs, entry.oid->hash, show, base, entry.path, @@ -121,8 +138,17 @@ static void process_tree(struct rev_info *revs, process_blob(revs, lookup_blob(entry.oid), show, base, entry.path, - cb_data); + cb_data, filter, filter_data); } + + if (filter) { + r = filter(LOFT_END_TREE, obj, base->buf, &base->buf[baselen], filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, base->buf, cb_data); + } + strbuf_setlen(base, baselen); free_tree_buffer(tree); } @@ -183,10 +209,10 @@ static void add_pending_tree(struct rev_info *revs, struct tree *tree) add_pending_object(revs, &tree->object, ""); } -void traverse_commit_list(struct rev_info *revs, - show_commit_fn show_commit, - show_object_fn show_object, - void *data) +void traverse_commit_list_worker( + struct rev_info *revs, + show_commit_fn show_commit, show_object_fn show_object, void *show_data, + filter_object_fn filter, void *filter_data) { int i; struct commit *commit; @@ -200,7 +226,7 @@ void traverse_commit_list(struct rev_info *revs, */ if (commit->tree) add_pending_tree(revs, commit->tree); - show_commit(commit, data); + show_commit(commit, show_data); } for (i = 0; i < revs->pending.nr; i++) { struct object_array_entry *pending = revs->pending.objects + i; @@ -211,19 +237,19 @@ void traverse_commit_list(struct rev_info *revs, continue; if (obj->type == OBJ_TAG) { obj->flags |= SEEN; - show_object(obj, name, data); + show_object(obj, name, show_data); continue; } if (!path) path = ""; if (obj->type == OBJ_TREE) { process_tree(revs, (struct tree *)obj, show_object, - &base, path, data); + &base, path, show_data, filter, filter_data); continue; } if (obj->type == OBJ_BLOB) { process_blob(revs, (struct blob *)obj, show_object, - &base, path, data); + &base, path, show_data, filter, filter_data); continue; } die("unknown pending object %s (%s)", @@ -232,3 +258,56 @@ void traverse_commit_list(struct rev_info *revs, object_array_clear(&revs->pending); strbuf_release(&base); } + +void traverse_commit_list(struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + void *show_data) +{ + traverse_commit_list_worker( + revs, + show_commit, show_object, show_data, + NULL, NULL); +} + +void traverse_commit_list_filtered( + struct list_objects_filter_options *filter_options, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *show_data) +{ + switch (filter_options->choice) { + case LOFC_DISABLED: + traverse_commit_list(revs, show_commit, show_object, show_data); + return; + + case LOFC_BLOB_NONE: + traverse_commit_list__blobs_none( + revs, show_commit, show_object, 
print_omitted_object, + show_data); + return; + + case LOFC_BLOB_LIMIT: + traverse_commit_list__blobs_limit( + revs, show_commit, show_object, print_omitted_object, + show_data, filter_options->blob_limit_value); + return; + + case LOFC_SPARSE_OID: + traverse_commit_list__sparse_oid( + revs, show_commit, show_object, print_omitted_object, + show_data, filter_options->sparse_oid_value); + return; + + case LOFC_SPARSE_PATH: + traverse_commit_list__sparse_path( + revs, show_commit, show_object, print_omitted_object, + show_data, filter_options->sparse_path_value); + return; + + default: + die("unspecified list-objects filter"); + } +} diff --git a/list-objects.h b/list-objects.h index 0cebf8585cb179..d14b0e048e646b 100644 --- a/list-objects.h +++ b/list-objects.h @@ -1,6 +1,9 @@ #ifndef LIST_OBJECTS_H #define LIST_OBJECTS_H +#include "list-objects-filter-map.h" +#include "list-objects-filter-options.h" + typedef void (*show_commit_fn)(struct commit *, void *); typedef void (*show_object_fn)(struct object *, const char *, void *); void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *); @@ -8,4 +11,42 @@ void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, voi typedef void (*show_edge_fn)(struct commit *); void mark_edges_uninteresting(struct rev_info *, show_edge_fn); -#endif +enum list_objects_filter_result { + LOFR_ZERO = 0, + LOFR_MARK_SEEN = 1<<0, + LOFR_SHOW = 1<<1, +}; + +/* See object.h and revision.h */ +#define FILTER_REVISIT (1<<25) + +enum list_objects_filter_type { + LOFT_BEGIN_TREE, + LOFT_END_TREE, + LOFT_BLOB +}; + +typedef enum list_objects_filter_result list_objects_filter_result; +typedef enum list_objects_filter_type list_objects_filter_type; + +typedef list_objects_filter_result (*filter_object_fn)( + list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data); + +void traverse_commit_list_worker( + struct rev_info *, + show_commit_fn, show_object_fn, void *show_data, + filter_object_fn filter, void *filter_data); + +void traverse_commit_list_filtered( + struct list_objects_filter_options *filter_options, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + list_objects_filter_map_foreach_cb print_omitted_object, + void *show_data); + +#endif /* LIST_OBJECTS_H */ diff --git a/partial-clone-utils.c b/partial-clone-utils.c new file mode 100644 index 00000000000000..8c925ae6208813 --- /dev/null +++ b/partial-clone-utils.c @@ -0,0 +1,99 @@ +#include "cache.h" +#include "config.h" +#include "partial-clone-utils.h" + +int is_partial_clone_registered(void) +{ + if (repository_format_partial_clone_remote || + repository_format_partial_clone_filter) + return 1; + + return 0; +} + +void partial_clone_utils_register( + const struct list_objects_filter_options *filter_options, + const char *remote, + const char *cmd_name) +{ + struct strbuf buf = STRBUF_INIT; + + if (is_partial_clone_registered()) { + /* + * The original partial-clone or a previous partial-fetch + * already registered the partial-clone settings. + * If we get here, we are in a subsequent partial-* command + * (with explicit filter args on the command line). + * + * For now, we restrict subsequent commands to one + * consistent with the original request. We may relax + * this later after we get more experience with the + * partial-clone feature. 
+ * + * [] Restrict to same remote because our dynamic + * object loading only knows how to fetch objects + * from 1 remote. + */ + assert(filter_options && filter_options->choice); + assert(remote && *remote); + + if (strcmp(remote, repository_format_partial_clone_remote)) + die("%s --%s currently limited to remote '%s'", + cmd_name, CL_ARG__FILTER, + repository_format_partial_clone_remote); + + /* + * Treat the (possibly new) filter-spec as transient; + * use it for the current command, but do not overwrite + * the default. + */ + return; + } + + repository_format_partial_clone_remote = xstrdup(remote); + repository_format_partial_clone_filter = xstrdup(filter_options->raw_value); + + /* + * Force repo version > 0 to enable extensions namespace. + */ + git_config_set("core.repositoryformatversion", "1"); + + /* + * Use the "extensions" namespace in the config to record + * the name of the remote used in the partial clone. + * This will help us return to that server when we need + * to backfill missing objects. + * + * It is also used to indicate that there *MAY* be + * missing objects so that subsequent commands don't + * immediately die if they hit one. + * + * Also remember the initial filter settings used by + * clone as a default for future fetches. + */ + git_config_set("extensions." KEY_PARTIALCLONEREMOTE, + repository_format_partial_clone_remote); + git_config_set("extensions." KEY_PARTIALCLONEFILTER, + repository_format_partial_clone_filter); + + /* + * TODO Do we need to record both partial-clone + * parameters in the extensions namespace and in the + * section for the remote? + * + * Or should we just remember 1 in each, as in: + * "extension.partialcloneremote=" + * "remote..filter=" + * The issue is when can we set both of the + * repository_format_partial_clone_* globals + * durint subsequent startups. + * See setup.c:check_repo_format(). + */ + strbuf_addf(&buf, "remote.%s.%s", remote, KEY_PARTIALCLONEREMOTE); + git_config_set(buf.buf, repository_format_partial_clone_remote); + + strbuf_addf(&buf, "remote.%s.%s", remote, KEY_PARTIALCLONEFILTER); + git_config_set(buf.buf, repository_format_partial_clone_filter); + + strbuf_release(&buf); +} diff --git a/partial-clone-utils.h b/partial-clone-utils.h new file mode 100644 index 00000000000000..b52757084d1b81 --- /dev/null +++ b/partial-clone-utils.h @@ -0,0 +1,34 @@ +#ifndef PARTIAL_CLONE_UTILS_H +#define PARTIAL_CLONE_UTILS_H + +#include "list-objects-filter-options.h" + +/* + * Register that partial-clone was used to create the repo and + * update the config on disk. + * + * If nothing else, this indicates that the ODB may have missing + * objects and that various commands should handle that gracefully. + * + * Record the remote used for the clone so that we know where + * to get missing objects in the future. + * + * Also record the filter expression so that we know something + * about the missing objects (e.g., size-limit vs sparse). + * + * May also be used by a partial-fetch following a normal clone + * to turn on the above tracking. + */ +extern void partial_clone_utils_register( + const struct list_objects_filter_options *filter_options, + const char *remote, + const char *cmd_name); + +/* + * Return 1 if partial-clone was used to create the repo + * or a subsequent partial-fetch was used. This is an + * indicator that there may be missing objects. 
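+ * + * Purely as an illustration (not part of this patch), and using the + * example values "origin" for the promisor remote and "blob:none" for + * the filter, the registration above would leave roughly this in the + * repository config: + * + * [core] + * repositoryformatversion = 1 + * [extensions] + * partialcloneremote = origin + * partialclonefilter = blob:none + * [remote "origin"] + * partialcloneremote = origin + * partialclonefilter = blob:none + * + * and is_partial_clone_registered() then reports 1 on later startups.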
+ */ +extern int is_partial_clone_registered(void); + +#endif /* PARTIAL_CLONE_UTILS_H */ diff --git a/remote-curl.c b/remote-curl.c index 0053b09549ab41..44ceaaed734a31 100644 --- a/remote-curl.c +++ b/remote-curl.c @@ -13,6 +13,7 @@ #include "credential.h" #include "sha1-array.h" #include "send-pack.h" +#include "list-objects-filter-options.h" static struct remote *remote; /* always ends with a trailing slash */ @@ -22,6 +23,7 @@ struct options { int verbosity; unsigned long depth; char *deepen_since; + char *partial_clone_filter; struct string_list deepen_not; struct string_list push_options; unsigned progress : 1, @@ -157,6 +159,9 @@ static int set_option(const char *name, const char *value) return -1; return 0; #endif /* LIBCURL_VERSION_NUM >= 0x070a08 */ + } else if (!strcmp(name, REMOTE_KEY_PARTIAL_CLONE_FILTER)) { + options.partial_clone_filter = xstrdup(value); + return 0; } else { return 1 /* unsupported */; } @@ -822,6 +827,10 @@ static int fetch_git(struct discovery *heads, options.deepen_not.items[i].string); if (options.deepen_relative && options.depth) argv_array_push(&args, "--deepen-relative"); + if (options.partial_clone_filter) + argv_array_pushf(&args, "--%s=%s", + CL_ARG__FILTER, options.partial_clone_filter); + argv_array_push(&args, url.buf); for (i = 0; i < nr_heads; i++) { diff --git a/remote.c b/remote.c index b220f0dfc619a6..6874c5fed293ee 100644 --- a/remote.c +++ b/remote.c @@ -440,6 +440,8 @@ static int handle_config(const char *key, const char *value, void *cb) key, value); } else if (!strcmp(subkey, "vcs")) { return git_config_string(&remote->foreign_vcs, key, value); + } else if (!strcmp(subkey, REMOTE_KEY_PARTIAL_CLONE_FILTER)) { + return git_config_string(&remote->partial_clone_filter, key, value); } return 0; } diff --git a/remote.h b/remote.h index 2ecf4c8c74ce59..7c2267fc872b4b 100644 --- a/remote.h +++ b/remote.h @@ -56,8 +56,12 @@ struct remote { */ char *http_proxy; char *http_proxy_authmethod; + + const char *partial_clone_filter; }; +#define REMOTE_KEY_PARTIAL_CLONE_FILTER "partialclonefilter" + struct remote *remote_get(const char *name); struct remote *pushremote_get(const char *name); int remote_is_configured(struct remote *remote, int in_repo); diff --git a/setup.c b/setup.c index 03f51e056cd6e6..bc4133dd39f82c 100644 --- a/setup.c +++ b/setup.c @@ -420,6 +420,19 @@ static int check_repo_format(const char *var, const char *value, void *vdata) ; else if (!strcmp(ext, "preciousobjects")) data->precious_objects = git_config_bool(var, value); + + else if (!strcmp(ext, KEY_PARTIALCLONEREMOTE)) + if (!value) + return config_error_nonbool(var); + else + data->partial_clone_remote = xstrdup(value); + + else if (!strcmp(ext, KEY_PARTIALCLONEFILTER)) + if (!value) + return config_error_nonbool(var); + else + data->partial_clone_filter = xstrdup(value); + else string_list_append(&data->unknown_extensions, ext); } else if (strcmp(var, "core.bare") == 0) { @@ -463,6 +476,8 @@ static int check_repository_format_gently(const char *gitdir, int *nongit_ok) } repository_format_precious_objects = candidate.precious_objects; + repository_format_partial_clone_remote = candidate.partial_clone_remote; + repository_format_partial_clone_filter = candidate.partial_clone_filter; string_list_clear(&candidate.unknown_extensions, 0); if (!has_common) { if (candidate.is_bare != -1) { diff --git a/t/t5317-pack-objects-filter-objects.sh b/t/t5317-pack-objects-filter-objects.sh new file mode 100755 index 00000000000000..ef7a8f60e60b88 --- /dev/null +++ 
b/t/t5317-pack-objects-filter-objects.sh @@ -0,0 +1,384 @@ +#!/bin/sh + +test_description='git pack-objects with object filtering for partial clone' + +. ./test-lib.sh + +# Test blob:none filter. + +test_expect_success 'setup r1' ' + echo "{print \$1}" >print_1.awk && + echo "{print \$2}" >print_2.awk && + + git init r1 && + for n in 1 2 3 4 5 + do + echo "This is file: $n" > r1/file.$n + git -C r1 add file.$n + git -C r1 commit -m "$n" + done +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r1 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r1 index-pack ../all.pack && + git -C r1 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:none packfile has no blobs' ' + git -C r1 pack-objects --rev --stdout --filter=blob:none >filter.pack <<-EOF && + HEAD + EOF + git -C r1 index-pack ../filter.pack && + git -C r1 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + nr=$(wc -l <observed) && + test 0 -eq $nr +' + +test_expect_success 'verify normal and blob:none packfiles have same commits/trees' ' + git -C r1 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r1 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Test blob:limit=<n>[kmg] filter. +# We boundary test around the size parameter. The filter is strictly less than +# the value, so size 500 and 1000 should have the same results, but 1001 should +# differ (the 1000-byte blob is no longer omitted). + +test_expect_success 'setup r2' ' + git init r2 && + for n in 1000 10000 + do + printf "%"$n"s" X > r2/large.$n + git -C r2 add large.$n + git -C r2 commit -m "$n" + done +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../all.pack && + git -C r2 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=500 omits all blobs' ' + git -C r2 pack-objects --rev --stdout --filter=blob:limit=500 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + nr=$(wc -l <observed) && + test 0 -eq $nr +' + +test_expect_success 'verify blob:limit=1000' ' + git -C r2 pack-objects --rev --stdout --filter=blob:limit=1000 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + nr=$(wc -l <observed) && + test 0 -eq $nr +' + +test_expect_success 'verify blob:limit=1001' ' + git -C r2 ls-files -s large.1000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=1001 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=10001' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=10001 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1k' ' + git -C r2 ls-files -s large.1000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects
--rev --stdout --filter=blob:limit=1k >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1m' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=1m >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify normal and blob:limit packfiles have same commits/trees' ' + git -C r2 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r2 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Test sparse:path= filter. +# Use a local file containing a sparse-checkout specification to filter +# out blobs not required for the corresponding sparse-checkout. We do not +# require sparse-checkout to actually be enabled. + +test_expect_success 'setup r3' ' + git init r3 && + mkdir r3/dir1 && + for n in sparse1 sparse2 + do + echo "This is file: $n" > r3/$n + git -C r3 add $n + echo "This is file: dir1/$n" > r3/dir1/$n + git -C r3 add dir1/$n + done && + git -C r3 commit -m "sparse" && + echo dir1/ >pattern1 && + echo sparse1 >pattern2 +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r3 ls-files -s sparse1 sparse2 dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r3 index-pack ../all.pack && + git -C r3 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:path=pattern1' ' + git -C r3 ls-files -s dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 pack-objects --rev --stdout --filter=sparse:path=../pattern1 >filter.pack <<-EOF && + HEAD + EOF + git -C r3 index-pack ../filter.pack && + git -C r3 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify normal and sparse:path=pattern1 packfiles have same commits/trees' ' + git -C r3 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r3 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:path=pattern2' ' + git -C r3 ls-files -s sparse1 dir1/sparse1 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 pack-objects --rev --stdout --filter=sparse:path=../pattern2 >filter.pack <<-EOF && + HEAD + EOF + git -C r3 index-pack ../filter.pack && + git -C r3 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify normal and sparse:path=pattern2 packfiles have same commits/trees' ' + git -C r3 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r3 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | 
awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Test sparse:oid= filter. +# Like sparse:path, but we get the sparse-checkout specification from +# a blob rather than a file on disk. + +test_expect_success 'setup r4' ' + git init r4 && + mkdir r4/dir1 && + for n in sparse1 sparse2 + do + echo "This is file: $n" > r4/$n + git -C r4 add $n + echo "This is file: dir1/$n" > r4/dir1/$n + git -C r4 add dir1/$n + done && + echo dir1/ >r4/pattern && + git -C r4 add pattern && + git -C r4 commit -m "pattern" +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r4 ls-files -s pattern sparse1 sparse2 dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r4 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r4 index-pack ../all.pack && + git -C r4 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:oid=OID' ' + git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + oid=$(git -C r4 ls-files -s pattern | awk -f print_2.awk) && + git -C r4 pack-objects --rev --stdout --filter=sparse:oid=$oid >filter.pack <<-EOF && + HEAD + EOF + git -C r4 index-pack ../filter.pack && + git -C r4 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:oid=oid-ish' ' + git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r4 pack-objects --rev --stdout --filter=sparse:oid=master:pattern >filter.pack <<-EOF && + HEAD + EOF + git -C r4 index-pack ../filter.pack && + git -C r4 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Delete some loose objects and use pack-objects, but WITHOUT any filtering. +# This models previously omitted objects that we did not receive. + +test_expect_success 'setup r1 - delete loose blobs' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + for id in `cat expected | sed "s|..|&/|"` + do + rm r1/.git/objects/$id + done +' + +test_expect_success 'verify pack-objects fails w/ missing objects' ' + test_must_fail git -C r1 pack-objects --rev --stdout >miss.pack <<-EOF + HEAD + EOF +' + +if ! test_have_prereq TODO; then + skip_all='TODO Allow pack-objects to work with missing objects' + test_done +fi + +test_expect_success 'verify pack-objects w/ extension.partialcloneremote set succeeds' ' + git -C r1 config --local core.repositoryformatversion 1 && + git -C r1 config --local extensions.partialcloneremote "origin" && + git -C r1 pack-objects --rev --stdout >miss.pack <<-EOF + HEAD + EOF +' + +test_expect_success 'veify pack-objects w/ extension.partialclonefilter set succeeds' ' + git -C r1 config --local core.repositoryformatversion 1 && + git -C r1 config --local extensions.partialclonefilter "something" && + git -C r1 pack-objects --rev --stdout >miss.pack <<-EOF + HEAD + EOF +' + +test_done diff --git a/t/t6112-rev-list-filters-objects.sh b/t/t6112-rev-list-filters-objects.sh new file mode 100755 index 00000000000000..26fa12fed3d0ca --- /dev/null +++ b/t/t6112-rev-list-filters-objects.sh @@ -0,0 +1,223 @@ +#!/bin/sh + +test_description='git rev-list with object filtering for partial clone' + +. ./test-lib.sh + +# Test the blob:none filter. 
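+# With --filter-print-omitted, an omitted object is listed with a "~" +# prefixed to its object id, which is why the tests below strip that +# marker with sed "s/~//" before comparing against the ls-files output.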
+ +test_expect_success 'setup r1' ' + echo "{print \$1}" >print_1.awk && + echo "{print \$2}" >print_2.awk && + + git init r1 && + for n in 1 2 3 4 5 + do + echo "This is file: $n" > r1/file.$n + git -C r1 add file.$n + git -C r1 commit -m "$n" + done +' + +test_expect_success 'verify blob:none omits all 5 blobs' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r1 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:none \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify emitted+omitted == all' ' + git -C r1 rev-list HEAD --objects \ + | awk -f print_1.awk \ + | sort >expected && + git -C r1 rev-list HEAD --objects --filter-print-omitted --filter=blob:none \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + + +# Test blob:limit=<n>[kmg] filter. +# We boundary test around the size parameter. The filter is strictly less than +# the value, so size 500 and 1000 should have the same results, but 1001 should +# differ (the 1000-byte blob is no longer omitted). + +test_expect_success 'setup r2' ' + git init r2 && + for n in 1000 10000 + do + printf "%"$n"s" X > r2/large.$n + git -C r2 add large.$n + git -C r2 commit -m "$n" + done +' + +test_expect_success 'verify blob:limit=500 omits all blobs' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=500 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify emitted+omitted == all' ' + git -C r2 rev-list HEAD --objects \ + | awk -f print_1.awk \ + | sort >expected && + git -C r2 rev-list HEAD --objects --filter-print-omitted --filter=blob:limit=500 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1000' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1000 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1001' ' + git -C r2 ls-files -s large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1001 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1k' ' + git -C r2 ls-files -s large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1k \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1m' ' + cat </dev/null >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1m \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +# Test sparse:path=<path> filter. +# Use a local file containing a sparse-checkout specification to filter +# out blobs not required for the corresponding sparse-checkout. We do not +# require sparse-checkout to actually be enabled.
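+# As a concrete sketch of what the tests below construct: a specification +# file whose only line is "dir1/" keeps the blobs under dir1/ and causes +# the top-level blobs (sparse1 and sparse2) to be omitted.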
+ +test_expect_success 'setup r3' ' + git init r3 && + mkdir r3/dir1 && + for n in sparse1 sparse2 + do + echo "This is file: $n" > r3/$n + git -C r3 add $n + echo "This is file: dir1/$n" > r3/dir1/$n + git -C r3 add dir1/$n + done && + git -C r3 commit -m "sparse" && + echo dir1/ >pattern1 && + echo sparse1 >pattern2 +' + +test_expect_success 'verify sparse:path=pattern1 omits top-level files' ' + git -C r3 ls-files -s sparse1 sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:path=../pattern1 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:path=pattern2 omits both sparse2 files' ' + git -C r3 ls-files -s sparse2 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:path=../pattern2 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +# Test sparse:oid= filter. +# Like sparse:path, but we get the sparse-checkout specification from +# a blob rather than a file on disk. + +test_expect_success 'setup r3 part 2' ' + echo dir1/ >r3/pattern && + git -C r3 add pattern && + git -C r3 commit -m "pattern" +' + +test_expect_success 'verify sparse:oid=OID omits top-level files' ' + git -C r3 ls-files -s pattern sparse1 sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + oid=$(git -C r3 ls-files -s pattern | awk -f print_2.awk) && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:oid=$oid \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:oid=oid-ish omits top-level files' ' + git -C r3 ls-files -s pattern sparse1 sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:oid=master:pattern \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +# Delete some loose objects and use rev-list, but WITHOUT any filtering. +# This models previously omitted objects that we did not receive. 
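+# (Sketch with a hypothetical object id: after "rm .git/objects/aa/bbbb...", +# rev-list --objects --filter-print-missing lists the object as "?aabbbb..." +# instead of failing, while the same traversal without the option is +# expected to fail.)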
+ +test_expect_success 'rev-list W/ print-missing' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + for id in `cat expected | sed "s|..|&/|"` + do + rm r1/.git/objects/$id + done && + git -C r1 rev-list --quiet HEAD --filter-print-missing --objects \ + | awk -f print_1.awk \ + | sed "s/?//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'rev-list W/O print-missing fails' ' + test_must_fail git -C r1 rev-list --quiet --objects HEAD +' + +test_expect_success 'rev-list W/ extension.partialcloneremote set succeeds' ' + git -C r1 config --local core.repositoryformatversion 1 && + git -C r1 config --local extensions.partialcloneremote "origin" && + git -C r1 rev-list --quiet --objects HEAD +' + +test_expect_success 'rev-list W/ extension.partialclonefilter set succeeds' ' + git -C r1 config --local core.repositoryformatversion 1 && + git -C r1 config --local extensions.partialclonefilter "something" && + git -C r1 rev-list --quiet --objects HEAD +' + +test_done diff --git a/transport-helper.c b/transport-helper.c index c948d5215c22fb..86a8abe97039dd 100644 --- a/transport-helper.c +++ b/transport-helper.c @@ -671,6 +671,11 @@ static int fetch(struct transport *transport, if (data->transport_options.update_shallow) set_helper_option(transport, "update-shallow", "true"); + if (data->transport_options.filter_options.choice) + set_helper_option( + transport, TRANS_OPT_LIST_OBJECTS_FILTER, + data->transport_options.filter_options.raw_value); + if (data->fetch) return fetch_with_fetch(transport, nr_heads, to_fetch); diff --git a/transport.c b/transport.c index f1e2f61991424f..1629df35890b1e 100644 --- a/transport.c +++ b/transport.c @@ -161,6 +161,10 @@ static int set_git_option(struct git_transport_options *opts, opts->deepen_relative = !!value; return 0; } + else if (!strcmp(name, TRANS_OPT_LIST_OBJECTS_FILTER)) { + parse_list_objects_filter(&opts->filter_options, value); + return 0; + } return 1; } @@ -228,6 +232,7 @@ static int fetch_refs_via_pack(struct transport *transport, data->options.check_self_contained_and_connected; args.cloning = transport->cloning; args.update_shallow = data->options.update_shallow; + args.filter_options = data->options.filter_options; if (!data->got_remote_heads) { connect_setup(transport, 0); diff --git a/transport.h b/transport.h index bc5571574b6780..23e622b318dbfd 100644 --- a/transport.h +++ b/transport.h @@ -4,6 +4,7 @@ #include "cache.h" #include "run-command.h" #include "remote.h" +#include "list-objects-filter-options.h" struct string_list; @@ -21,6 +22,7 @@ struct git_transport_options { const char *uploadpack; const char *receivepack; struct push_cas_option *cas; + struct list_objects_filter_options filter_options; }; enum transport_family { @@ -210,6 +212,9 @@ void transport_check_allowed(const char *type); /* Send push certificates */ #define TRANS_OPT_PUSH_CERT "pushcert" +/* See Documentation/technical/pack-protocol.txt */ +#define TRANS_OPT_LIST_OBJECTS_FILTER CL_ARG__FILTER + /** * Returns 0 if the option was used, non-zero otherwise. Prints a * message to stderr if the option is not used. 
diff --git a/upload-pack.c b/upload-pack.c index e25f725c0feaa5..ff88ea17541002 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -18,6 +18,7 @@ #include "parse-options.h" #include "argv-array.h" #include "prio-queue.h" +#include "list-objects-filter-options.h" static const char * const upload_pack_usage[] = { N_("git upload-pack [] "), @@ -64,6 +65,9 @@ static int advertise_refs; static int stateless_rpc; static const char *pack_objects_hook; +static int capability_filter_objects_requested; +static struct list_objects_filter_options filter_options; + static void reset_timeout(void) { alarm(timeout); @@ -132,6 +136,14 @@ static void create_pack_file(void) if (use_include_tag) argv_array_push(&pack_objects.args, "--include-tag"); + /* + * TODO Do we need to quote raw_value? + */ + if (filter_options.choice) + argv_array_pushf(&pack_objects.args, "--%s=%s", + CL_ARG__FILTER, + filter_options.raw_value); + pack_objects.in = -1; pack_objects.out = -1; pack_objects.err = -1; @@ -794,6 +806,12 @@ static void receive_needs(void) deepen_rev_list = 1; continue; } + if (skip_prefix(line, (CL_ARG__FILTER " "), &arg)) { + parse_list_objects_filter(&filter_options, arg); + if (filter_options.choice && !capability_filter_objects_requested) + die("git upload-pack: filtering capability not negotiated"); + continue; + } if (!skip_prefix(line, "want ", &arg) || get_oid_hex(arg, &oid_buf)) die("git upload-pack: protocol error, " @@ -821,6 +839,8 @@ static void receive_needs(void) no_progress = 1; if (parse_feature_request(features, "include-tag")) use_include_tag = 1; + if (parse_feature_request(features, CL_ARG__FILTER)) + capability_filter_objects_requested = 1; o = parse_object(&oid_buf); if (!o) { @@ -929,7 +949,8 @@ static int send_ref(const char *refname, const struct object_id *oid, { static const char *capabilities = "multi_ack thin-pack side-band" " side-band-64k ofs-delta shallow deepen-since deepen-not" - " deepen-relative no-progress include-tag multi_ack_detailed"; + " deepen-relative no-progress include-tag multi_ack_detailed" + " " CL_ARG__FILTER; const char *refname_nons = strip_namespace(refname); struct object_id peeled;