Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pack-objects: create new name-hash algorithm #5157

Merged
merged 11 commits into from
Sep 24, 2024
3 changes: 2 additions & 1 deletion Documentation/git-pack-objects.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ SYNOPSIS
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
[--cruft] [--cruft-expiration=<time>]
[--stdout [--filter=<filter-spec>] | <base-name>]
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
[--shallow] [--keep-true-parents] [--[no-]sparse]
[--full-name-hash] < <object-list>


DESCRIPTION
Expand Down
4 changes: 3 additions & 1 deletion Documentation/git-repack.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
SYNOPSIS
--------
[verse]
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
[--write-midx] [--full-name-hash]

DESCRIPTION
-----------
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,7 @@ TEST_BUILTINS_OBJS += test-match-trees.o
TEST_BUILTINS_OBJS += test-mergesort.o
TEST_BUILTINS_OBJS += test-mktemp.o
TEST_BUILTINS_OBJS += test-oid-array.o
TEST_BUILTINS_OBJS += test-name-hash.o
TEST_BUILTINS_OBJS += test-online-cpus.o
TEST_BUILTINS_OBJS += test-pack-mtimes.o
TEST_BUILTINS_OBJS += test-parse-options.o
Expand Down
25 changes: 20 additions & 5 deletions builtin/pack-objects.c
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,14 @@ struct configured_exclusion {
static struct oidmap configured_exclusions;

static struct oidset excluded_by_config;
static int use_full_name_hash = -1;

static inline uint32_t pack_name_hash_fn(const char *name)
{
if (use_full_name_hash)
return pack_full_name_hash(name);
return pack_name_hash(name);
}

/*
* stats
Expand Down Expand Up @@ -1670,7 +1678,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
return 0;
}

create_object_entry(oid, type, pack_name_hash(name),
create_object_entry(oid, type, pack_name_hash_fn(name),
exclude, name && no_try_delta(name),
found_pack, found_offset);
return 1;
Expand Down Expand Up @@ -1884,7 +1892,7 @@ static void add_preferred_base_object(const char *name)
{
struct pbase_tree *it;
size_t cmplen;
unsigned hash = pack_name_hash(name);
unsigned hash = pack_name_hash_fn(name);

if (!num_preferred_base || check_pbase_path(hash))
return;
Expand Down Expand Up @@ -3394,7 +3402,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
* here using a now in order to perhaps improve the delta selection
* process.
*/
oe->hash = pack_name_hash(name);
oe->hash = pack_name_hash_fn(name);
oe->no_try_delta = name && no_try_delta(name);

stdin_packs_hints_nr++;
Expand Down Expand Up @@ -3544,7 +3552,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
entry = packlist_find(&to_pack, oid);
if (entry) {
if (name) {
entry->hash = pack_name_hash(name);
entry->hash = pack_name_hash_fn(name);
entry->no_try_delta = no_try_delta(name);
}
} else {
Expand All @@ -3567,7 +3575,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
return;
}

entry = create_object_entry(oid, type, pack_name_hash(name),
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
0, name && no_try_delta(name),
pack, offset);
}
Expand Down Expand Up @@ -4397,6 +4405,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
N_("protocol"),
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
N_("(EXPERIMENTAL!) optimize delta compression across identical path names over time")),
OPT_END(),
};

Expand Down Expand Up @@ -4544,6 +4554,11 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
if (pack_to_stdout || !rev_list_all)
write_bitmap_index = 0;

if (write_bitmap_index && use_full_name_hash > 0)
die(_("currently, the --full-name-hash option is incompatible with --write-bitmap-index"));
if (use_full_name_hash < 0)
use_full_name_hash = git_env_bool("GIT_TEST_FULL_NAME_HASH", 0);

if (use_delta_islands)
strvec_push(&rp, "--topo-order");

Expand Down
9 changes: 8 additions & 1 deletion builtin/repack.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ static int run_update_server_info = 1;
static char *packdir, *packtmp_name, *packtmp;

static const char *const git_repack_usage[] = {
N_("git repack [<options>]"),
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
"[--write-midx] [--full-name-hash]"),
NULL
};

Expand All @@ -57,6 +59,7 @@ struct pack_objects_args {
int no_reuse_object;
int quiet;
int local;
int full_name_hash;
struct list_objects_filter_options filter_options;
};

Expand Down Expand Up @@ -288,6 +291,8 @@ static void prepare_pack_objects(struct child_process *cmd,
strvec_pushf(&cmd->args, "--no-reuse-delta");
if (args->no_reuse_object)
strvec_pushf(&cmd->args, "--no-reuse-object");
if (args->full_name_hash)
strvec_pushf(&cmd->args, "--full-name-hash");
if (args->local)
strvec_push(&cmd->args, "--local");
if (args->quiet)
Expand Down Expand Up @@ -1157,6 +1162,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
N_("pass --no-reuse-delta to git-pack-objects")),
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
N_("pass --no-reuse-object to git-pack-objects")),
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
N_("(EXPERIMENTAL!) pass --full-name-hash to git-pack-objects")),
OPT_NEGBIT('n', NULL, &run_update_server_info,
N_("do not run git-update-server-info"), 1),
OPT__QUIET(&po_args.quiet, N_("be quiet")),
Expand Down
1 change: 1 addition & 0 deletions ci/run-build-and-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ linux-TEST-vars)
export GIT_TEST_NO_WRITE_REV_INDEX=1
export GIT_TEST_CHECKOUT_WORKERS=2
export GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL=1
export GIT_TEST_FULL_NAME_HASH=1
;;
linux-clang)
export GIT_TEST_DEFAULT_HASH=sha1
Expand Down
21 changes: 21 additions & 0 deletions pack-objects.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,27 @@ static inline uint32_t pack_name_hash(const char *name)
return hash;
}

static inline uint32_t pack_full_name_hash(const char *name)
{
const uint32_t bigp = 1234572167U;
uint32_t c, hash = bigp;

if (!name)
return 0;

/*
* Do the simplest thing that will resemble pseudo-randomness: add
* random multiples of a large prime number with a binary shift.
* The goal is not to be cryptographic, but to be generally
* uniformly distributed.
*/
while ((c = *name++) != 0) {
hash += c * bigp;
hash = (hash >> 5) | (hash << 27);
}
return hash;
}

static inline enum object_type oe_type(const struct object_entry *e)
{
return e->type_valid ? e->type_ : OBJ_BAD;
Expand Down
4 changes: 4 additions & 0 deletions t/README
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,10 @@ a test and then fails then the whole test run will abort. This can help to make
sure the expected tests are executed and not silently skipped when their
dependency breaks or is simply not present in a new environment.

GIT_TEST_FULL_NAME_HASH=<boolean>, when true, sets the default name-hash
function in 'git pack-objects' to be the one used by the --full-name-hash
option.

GIT_TEST_FSCACHE=<boolean> exercises the uncommon fscache code path
which adds a cache below mingw's lstat and dirent implementations.

Expand Down
24 changes: 24 additions & 0 deletions t/helper/test-name-hash.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* test-name-hash.c: Read a list of paths over stdin and report on their
* name-hash and full name-hash.
*/

#include "test-tool.h"
#include "git-compat-util.h"
#include "pack-objects.h"
#include "strbuf.h"

int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
{
struct strbuf line = STRBUF_INIT;

while (!strbuf_getline(&line, stdin)) {
uint32_t name_hash = pack_name_hash(line.buf);
uint32_t full_hash = pack_full_name_hash(line.buf);

printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
}

strbuf_release(&line);
return 0;
}
1 change: 1 addition & 0 deletions t/helper/test-tool.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
{ "match-trees", cmd__match_trees },
{ "mergesort", cmd__mergesort },
{ "mktemp", cmd__mktemp },
{ "name-hash", cmd__name_hash },
{ "oid-array", cmd__oid_array },
{ "online-cpus", cmd__online_cpus },
{ "pack-mtimes", cmd__pack_mtimes },
Expand Down
1 change: 1 addition & 0 deletions t/helper/test-tool.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
int cmd__match_trees(int argc, const char **argv);
int cmd__mergesort(int argc, const char **argv);
int cmd__mktemp(int argc, const char **argv);
int cmd__name_hash(int argc, const char **argv);
int cmd__online_cpus(int argc, const char **argv);
int cmd__pack_mtimes(int argc, const char **argv);
int cmd__parse_options(int argc, const char **argv);
Expand Down
73 changes: 73 additions & 0 deletions t/perf/p5313-pack-objects.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/sh

test_description='Tests pack performance using bitmaps'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_expect_success 'create rev input' '
cat >in-thin <<-EOF &&
$(git rev-parse HEAD)
^$(git rev-parse HEAD~1)
EOF

cat >in-big <<-EOF
$(git rev-parse HEAD)
^$(git rev-parse HEAD~1000)
EOF
'

test_perf 'thin pack' '
git pack-objects --thin --stdout --revs --sparse <in-thin >out
'

test_size 'thin pack size' '
test_file_size out
'

test_perf 'thin pack with --full-name-hash' '
git pack-objects --thin --stdout --revs --sparse --full-name-hash <in-thin >out
'

test_size 'thin pack size with --full-name-hash' '
test_file_size out
'

test_perf 'big pack' '
git pack-objects --stdout --revs --sparse <in-big >out
'

test_size 'big pack size' '
test_file_size out
'

test_perf 'big pack with --full-name-hash' '
git pack-objects --stdout --revs --sparse --full-name-hash <in-big >out
'

test_size 'big pack size with --full-name-hash' '
test_file_size out
'

test_perf 'repack' '
git repack -adf
'

test_size 'repack size' '
pack=$(ls .git/objects/pack/pack-*.pack) &&
test_file_size "$pack"
'

test_perf 'repack with --full-name-hash' '
git repack -adf --full-name-hash
'

test_size 'repack size with --full-name-hash' '
pack=$(ls .git/objects/pack/pack-*.pack) &&
test_file_size "$pack"
'

test_done
41 changes: 41 additions & 0 deletions t/perf/p5314-name-hash.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/sh

test_description='Tests pack performance using bitmaps'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_size 'paths at head' '
git ls-tree -r --name-only HEAD >path-list &&
wc -l <path-list
'

test_size 'number of distinct name-hashes' '
cat path-list | test-tool name-hash >name-hashes &&
cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count &&
wc -l <name-hash-count
'

test_size 'number of distinct full-name-hashes' '
cat name-hashes | awk "{ print \$2; }" | sort -n | uniq -c >full-name-hash-count &&
wc -l <full-name-hash-count
'

test_size 'maximum multiplicity of name-hashes' '
cat name-hash-count | \
sort -nr | \
head -n 1 | \
awk "{ print \$1; }"
'

test_size 'maximum multiplicity of fullname-hashes' '
cat full-name-hash-count | \
sort -nr | \
head -n 1 | \
awk "{ print \$1; }"
'

test_done
1 change: 0 additions & 1 deletion t/t0450/txt-help-mismatches
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ rebase
remote
remote-ext
remote-fd
repack
reset
restore
rev-parse
Expand Down
15 changes: 15 additions & 0 deletions t/t5300-pack-object.sh
Original file line number Diff line number Diff line change
Expand Up @@ -674,4 +674,19 @@ do
'
done

# The following test is not necessarily a permanent choice, but since we do not
# have a "name hash version" bit in the .bitmap file format, we cannot write the
# full-name hash values into the .bitmap file without risking breakage later.
#
# TODO: Make these compatible in the future and replace this test with the
# expected behavior when both are specified.
test_expect_success '--full-name-hash and --write-bitmap-index are incompatible' '
test_must_fail git pack-objects base --all \
--full-name-hash --write-bitmap-index 2>err &&
grep incompatible err &&

# --stdout option silently removes --write-bitmap-index
git pack-objects --stdout --all --full-name-hash --write-bitmap-index >out
'

test_done
Loading
Loading