From 3a8cd93b061b3afad53aa33405a91434492617a4 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 08:55:03 -0400 Subject: [PATCH 01/10] survey: stub in new experimental 'git-survey' command Start work on a new 'git survey' command to scan the repository for monorepo performance and scaling problems. The goal is to measure the various known "dimensions of scale" and serve as a foundation for adding additional measurements as we learn more about Git monorepo scaling problems. The initial goal is to complement the scanning and analysis performed by the GO-based 'git-sizer' (https://github.com/github/git-sizer) tool. It is hoped that by creating a builtin command, we may be able to take advantage of internal Git data structures and code that is not accessible from GO to gain further insight into potential scaling problems. Co-authored-by: Derrick Stolee Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- .gitignore | 1 + Documentation/config.txt | 2 + Documentation/config/survey.txt | 11 +++++ Documentation/git-survey.txt | 36 ++++++++++++++++ Makefile | 1 + builtin.h | 1 + builtin/survey.c | 75 +++++++++++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + t/t8100-git-survey.sh | 18 ++++++++ 10 files changed, 147 insertions(+) create mode 100644 Documentation/config/survey.txt create mode 100644 Documentation/git-survey.txt create mode 100644 builtin/survey.c create mode 100755 t/t8100-git-survey.sh diff --git a/.gitignore b/.gitignore index 7ad711cf6f4672..a8bded84ad8b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -165,6 +165,7 @@ /git-submodule /git-submodule--helper /git-subtree +/git-survey /git-svn /git-switch /git-symbolic-ref diff --git a/Documentation/config.txt b/Documentation/config.txt index fedfaf30cd0d8b..939cc1387992f8 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -536,6 +536,8 @@ include::config/status.txt[] include::config/submodule.txt[] +include::config/survey.txt[] + 
include::config/tag.txt[] include::config/tar.txt[] diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt new file mode 100644 index 00000000000000..c1b0f852a1250e --- /dev/null +++ b/Documentation/config/survey.txt @@ -0,0 +1,11 @@ +survey.*:: + These variables adjust the default behavior of the `git survey` + command. The intention is that this command could be run in the + background with these options. ++ +-- + verbose:: + This boolean value implies the `--[no-]verbose` option. + progress:: + This boolean value implies the `--[no-]progress` option. +-- diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt new file mode 100644 index 00000000000000..cdd1ec4358b8bb --- /dev/null +++ b/Documentation/git-survey.txt @@ -0,0 +1,36 @@ +git-survey(1) +============= + +NAME +---- +git-survey - EXPERIMENTAL: Measure various repository dimensions of scale + +SYNOPSIS +-------- +[verse] +(EXPERIMENTAL!) `git survey` + +DESCRIPTION +----------- + +Survey the repository and measure various dimensions of scale. + +As repositories grow to "monorepo" size, certain data shapes can cause +performance problems. `git-survey` attempts to measure and report on +known problem areas. + +OPTIONS +------- + +--progress:: + Show progress. This is automatically enabled when interactive. + +OUTPUT +------ + +By default, `git survey` will print information about the repository in a +human-readable format that includes overviews and tables. 
+ +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index 82f60cbbfdba6e..d771344dc2841c 100644 --- a/Makefile +++ b/Makefile @@ -1312,6 +1312,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o +BUILTIN_OBJS += builtin/survey.o BUILTIN_OBJS += builtin/symbolic-ref.o BUILTIN_OBJS += builtin/tag.o BUILTIN_OBJS += builtin/unpack-file.o diff --git a/builtin.h b/builtin.h index 73dd0ccbe8c961..d4e8cf3b97b590 100644 --- a/builtin.h +++ b/builtin.h @@ -239,6 +239,7 @@ int cmd_status(int argc, const char **argv, const char *prefix); int cmd_stash(int argc, const char **argv, const char *prefix); int cmd_stripspace(int argc, const char **argv, const char *prefix); int cmd_submodule__helper(int argc, const char **argv, const char *prefix); +int cmd_survey(int argc, const char **argv, const char *prefix); int cmd_switch(int argc, const char **argv, const char *prefix); int cmd_symbolic_ref(int argc, const char **argv, const char *prefix); int cmd_tag(int argc, const char **argv, const char *prefix); diff --git a/builtin/survey.c b/builtin/survey.c new file mode 100644 index 00000000000000..f80fd04a65f1c2 --- /dev/null +++ b/builtin/survey.c @@ -0,0 +1,75 @@ +#define USE_THE_REPOSITORY_VARIABLE + +#include "builtin.h" +#include "config.h" +#include "parse-options.h" + +static const char * const survey_usage[] = { + N_("(EXPERIMENTAL!) git survey "), + NULL, +}; + +struct survey_opts { + int verbose; + int show_progress; +}; + +struct survey_context { + struct repository *repo; + + /* Options that control what is done. 
*/ + struct survey_opts opts; +}; + +static int survey_load_config_cb(const char *var, const char *value, + const struct config_context *cctx, void *pvoid) +{ + struct survey_context *ctx = pvoid; + + if (!strcmp(var, "survey.verbose")) { + ctx->opts.verbose = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.progress")) { + ctx->opts.show_progress = git_config_bool(var, value); + return 0; + } + + return git_default_config(var, value, cctx, pvoid); +} + +static void survey_load_config(struct survey_context *ctx) +{ + git_config(survey_load_config_cb, ctx); +} + +int cmd_survey(int argc, const char **argv, const char *prefix) +{ + static struct survey_context ctx = { + .opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ + }, + }; + + static struct option survey_options[] = { + OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(survey_usage, survey_options); + + ctx.repo = the_repository; + + prepare_repo_settings(ctx.repo); + survey_load_config(&ctx); + + argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); + + if (ctx.opts.show_progress < 0) + ctx.opts.show_progress = isatty(2); + + return 0; +} diff --git a/command-list.txt b/command-list.txt index c537114b4687b8..ecc9d2281a0909 100644 --- a/command-list.txt +++ b/command-list.txt @@ -187,6 +187,7 @@ git-stash mainporcelain git-status mainporcelain info git-stripspace purehelpers git-submodule mainporcelain +git-survey mainporcelain git-svn foreignscminterface git-switch mainporcelain history git-symbolic-ref plumbingmanipulators diff --git a/git.c b/git.c index dd6fcfad0fe46c..3c23bf7aa45432 100644 --- a/git.c +++ b/git.c @@ -623,6 +623,7 @@ static struct cmd_struct commands[] = { { "status", cmd_status, RUN_SETUP | NEED_WORK_TREE }, { "stripspace", cmd_stripspace }, { 
"submodule--helper", cmd_submodule__helper, RUN_SETUP }, + { "survey", cmd_survey, RUN_SETUP }, { "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE }, { "symbolic-ref", cmd_symbolic_ref, RUN_SETUP }, { "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG }, diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh new file mode 100755 index 00000000000000..2df7fa83629301 --- /dev/null +++ b/t/t8100-git-survey.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +test_description='git survey' + +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME + +TEST_PASSES_SANITIZE_LEAK=0 +export TEST_PASSES_SANITIZE_LEAK + +. ./test-lib.sh + +test_expect_success 'git survey -h shows experimental warning' ' + test_expect_code 129 git survey -h 2>usage && + grep "EXPERIMENTAL!" usage +' + +test_done From c08fa91a2478d23fdbd7ddde232b48c5308b029c Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 09:51:34 -0400 Subject: [PATCH 02/10] survey: add command line opts to select references By default we will scan all references in "refs/heads/", "refs/tags/" and "refs/remotes/". Add command line opts let the use ask for all refs or a subset of them and to include a detached HEAD. Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- Documentation/git-survey.txt | 34 +++++ builtin/survey.c | 247 +++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 9 ++ 3 files changed, 290 insertions(+) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index cdd1ec4358b8bb..c648ef704e3806 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -19,12 +19,46 @@ As repositories grow to "monorepo" size, certain data shapes can cause performance problems. `git-survey` attempts to measure and report on known problem areas. 
+Ref Selection and Reachable Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this first analysis phase, `git survey` will iterate over the set of +requested branches, tags, and other refs and treewalk over all of the +reachable commits, trees, and blobs and generate various statistics. + OPTIONS ------- --progress:: Show progress. This is automatically enabled when interactive. +Ref Selection +~~~~~~~~~~~~~ + +The following options control the set of refs that `git survey` will examine. +By default, `git survey` will look at tags, local branches, and remote refs. +If any of the following options are given, the default set is cleared and +only refs for the given options are added. + +--all-refs:: + Use all refs. This includes local branches, tags, remote refs, + notes, and stashes. This option overrides all of the following. + +--branches:: + Add local branches (`refs/heads/`) to the set. + +--tags:: + Add tags (`refs/tags/`) to the set. + +--remotes:: + Add remote branches (`refs/remotes/`) to the set. + +--detached:: + Add HEAD to the set. + +--other:: + Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. + OUTPUT ------ diff --git a/builtin/survey.c b/builtin/survey.c index f80fd04a65f1c2..a70e6d640c109f 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -2,16 +2,55 @@ #include "builtin.h" #include "config.h" +#include "object.h" +#include "object-store-ll.h" #include "parse-options.h" +#include "progress.h" +#include "ref-filter.h" +#include "strvec.h" +#include "trace2.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) 
git survey "), NULL, }; +struct survey_refs_wanted { + int want_all_refs; /* special override */ + + int want_branches; + int want_tags; + int want_remotes; + int want_detached; + int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ +}; + +static struct survey_refs_wanted default_ref_options = { + .want_all_refs = 1, +}; + struct survey_opts { int verbose; int show_progress; + struct survey_refs_wanted refs; +}; + +struct survey_report_ref_summary { + size_t refs_nr; + size_t branches_nr; + size_t remote_refs_nr; + size_t tags_nr; + size_t tags_annotated_nr; + size_t others_nr; + size_t unknown_nr; +}; + +/** + * This struct contains all of the information that needs to be printed + * at the end of the exploration of the repository and its references. + */ +struct survey_report { + struct survey_report_ref_summary refs; }; struct survey_context { @@ -19,8 +58,84 @@ struct survey_context { /* Options that control what is done. */ struct survey_opts opts; + + /* Info for output only. */ + struct survey_report report; + + /* + * The rest of the members are about enabling the activity + * of the 'git survey' command, including ref listings, object + * pointers, and progress. + */ + + struct progress *progress; + size_t progress_nr; + size_t progress_total; + + struct strvec refs; }; +static void clear_survey_context(struct survey_context *ctx) +{ + strvec_clear(&ctx->refs); +} + +/* + * After parsing the command line arguments, figure out which refs we + * should scan. + * + * If ANY were given in positive sense, then we ONLY include them and + * do not use the builtin values. + */ +static void fixup_refs_wanted(struct survey_context *ctx) +{ + struct survey_refs_wanted *rw = &ctx->opts.refs; + + /* + * `--all-refs` overrides and enables everything. 
+ */ + if (rw->want_all_refs == 1) { + rw->want_branches = 1; + rw->want_tags = 1; + rw->want_remotes = 1; + rw->want_detached = 1; + rw->want_other = 1; + return; + } + + /* + * If none of the `--<ref-type>` were given, we assume all + * of the builtin unspecified values. + */ + if (rw->want_branches == -1 && + rw->want_tags == -1 && + rw->want_remotes == -1 && + rw->want_detached == -1 && + rw->want_other == -1) { + *rw = default_ref_options; + return; + } + + /* + * Since we only allow positive boolean values on the command + * line, we will only have true values where they specified + * a `--<ref-type>`. + * + * So anything that still has an unspecified value should be + * set to false. + */ + if (rw->want_branches == -1) + rw->want_branches = 0; + if (rw->want_tags == -1) + rw->want_tags = 0; + if (rw->want_remotes == -1) + rw->want_remotes = 0; + if (rw->want_detached == -1) + rw->want_detached = 0; + if (rw->want_other == -1) + rw->want_other = 0; +} + static int survey_load_config_cb(const char *var, const char *value, const struct config_context *cctx, void *pvoid) { @@ -43,18 +158,145 @@ static void survey_load_config(struct survey_context *ctx) git_config(survey_load_config_cb, ctx); } +static void do_load_refs(struct survey_context *ctx, + struct ref_array *ref_array) +{ + struct ref_filter filter = REF_FILTER_INIT; + struct ref_sorting *sorting; + struct string_list sorting_options = STRING_LIST_INIT_DUP; + + string_list_append(&sorting_options, "objectname"); + sorting = ref_sorting_options(&sorting_options); + + if (ctx->opts.refs.want_detached) + strvec_push(&ctx->refs, "HEAD"); + + if (ctx->opts.refs.want_all_refs) { + strvec_push(&ctx->refs, "refs/"); + } else { + if (ctx->opts.refs.want_branches) + strvec_push(&ctx->refs, "refs/heads/"); + if (ctx->opts.refs.want_tags) + strvec_push(&ctx->refs, "refs/tags/"); + if (ctx->opts.refs.want_remotes) + strvec_push(&ctx->refs, "refs/remotes/"); + if (ctx->opts.refs.want_other) { + strvec_push(&ctx->refs, 
"refs/notes/"); + 
strvec_push(&ctx->refs, "refs/stash/"); + } + } + + filter.name_patterns = ctx->refs.v; + filter.ignore_case = 0; + filter.match_as_path = 1; + + if (ctx->opts.show_progress) { + ctx->progress_total = 0; + ctx->progress = start_progress(_("Scanning refs..."), 0); + } + + filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK); + + if (ctx->opts.show_progress) { + ctx->progress_total = ref_array->nr; + display_progress(ctx->progress, ctx->progress_total); + } + + ref_array_sort(sorting, ref_array); + + stop_progress(&ctx->progress); + ref_filter_clear(&filter); + ref_sorting_release(sorting); +} + +/* + * The REFS phase: + * + * Load the set of requested refs and assess them for scalability problems. + * Use that set to start a treewalk to all reachable objects and assess + * them. + * + * This data will give us insights into the repository itself (the number + * of refs, the size and shape of the DAG, the number and size of the + * objects). + * + * Theoretically, this data is independent of the on-disk representation + * (e.g. independent of packing concerns). 
+ */ +static void survey_phase_refs(struct survey_context *ctx) +{ + struct ref_array ref_array = { 0 }; + + trace2_region_enter("survey", "phase/refs", ctx->repo); + do_load_refs(ctx, &ref_array); + + ctx->report.refs.refs_nr = ref_array.nr; + for (size_t i = 0; i < ref_array.nr; i++) { + unsigned long size; + struct ref_array_item *item = ref_array.items[i]; + + switch (item->kind) { + case FILTER_REFS_TAGS: + ctx->report.refs.tags_nr++; + if (oid_object_info(ctx->repo, + &item->objectname, + &size) == OBJ_TAG) + ctx->report.refs.tags_annotated_nr++; + break; + + case FILTER_REFS_BRANCHES: + ctx->report.refs.branches_nr++; + break; + + case FILTER_REFS_REMOTES: + ctx->report.refs.remote_refs_nr++; + break; + + case FILTER_REFS_OTHERS: + ctx->report.refs.others_nr++; + break; + + default: + ctx->report.refs.unknown_nr++; + break; + } + } + + trace2_region_leave("survey", "phase/refs", ctx->repo); + + ref_array_clear(&ref_array); +} + int cmd_survey(int argc, const char **argv, const char *prefix) { static struct survey_context ctx = { .opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + + .refs.want_all_refs = -1, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, }, + .refs = STRVEC_INIT, }; static struct option survey_options[] = { OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + + OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &ctx.opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &ctx.opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &ctx.opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, N_("include detached 
HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + OPT_END(), }; @@ -71,5 +313,10 @@ int cmd_survey(int argc, const char **argv, const char *prefix) if (ctx.opts.show_progress < 0) ctx.opts.show_progress = isatty(2); + fixup_refs_wanted(&ctx); + + survey_phase_refs(&ctx); + + clear_survey_context(&ctx); return 0; } diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 2df7fa83629301..6656cf20bf7a17 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -15,4 +15,13 @@ test_expect_success 'git survey -h shows experimental warning' ' grep "EXPERIMENTAL!" usage ' +test_expect_success 'create a semi-interesting repo' ' + test_commit_bulk 10 +' + +test_expect_success 'git survey (default)' ' + git survey >out 2>err && + test_line_count = 0 err +' + test_done From 2c0755d9ffd6ccd7a37357d4cea06348f51e163f Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 15:58:32 -0400 Subject: [PATCH 03/10] survey: start pretty printing data in table form When 'git survey' provides information to the user, this will be presented in one of two formats: plaintext and JSON. The JSON implementation will be delayed until the functionality is complete for the plaintext format. The most important parts of the plaintext format are headers specifying the different sections of the report and tables providing concrete data. Create a custom table data structure that allows specifying a list of strings for the row values. When printing the table, check each column for the maximum width so we can create a table of the correct size from the start. The table structure is designed to be flexible to the different kinds of output that will be implemented in future changes. 
Signed-off-by: Derrick Stolee --- Documentation/git-survey.txt | 7 ++ builtin/survey.c | 157 +++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 18 +++- 3 files changed, 181 insertions(+), 1 deletion(-) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index c648ef704e3806..25d10781831c99 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -65,6 +65,13 @@ OUTPUT By default, `git survey` will print information about the repository in a human-readable format that includes overviews and tables. +References Summary +~~~~~~~~~~~~~~~~~~ + +The references summary includes a count of each kind of reference, +including branches, remote refs, and tags (split by "all" and +"annotated"). + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/survey.c b/builtin/survey.c index a70e6d640c109f..ffb6836de0aebb 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -7,6 +7,7 @@ #include "parse-options.h" #include "progress.h" #include "ref-filter.h" +#include "strbuf.h" #include "strvec.h" #include "trace2.h" @@ -80,6 +81,160 @@ static void clear_survey_context(struct survey_context *ctx) strvec_clear(&ctx->refs); } +struct survey_table { + const char *table_name; + struct strvec header; + struct strvec *rows; + size_t rows_nr; + size_t rows_alloc; +}; + +#define SURVEY_TABLE_INIT { \ + .header = STRVEC_INIT, \ +} + +static void clear_table(struct survey_table *table) +{ + strvec_clear(&table->header); + for (size_t i = 0; i < table->rows_nr; i++) + strvec_clear(&table->rows[i]); + free(table->rows); +} + +static void insert_table_rowv(struct survey_table *table, ...) 
+{ + va_list ap; + char *arg; + ALLOC_GROW(table->rows, table->rows_nr + 1, table->rows_alloc); + + memset(&table->rows[table->rows_nr], 0, sizeof(struct strvec)); + + va_start(ap, table); + while ((arg = va_arg(ap, char *))) + strvec_push(&table->rows[table->rows_nr], arg); + va_end(ap); + + table->rows_nr++; +} + +#define SECTION_SEGMENT "========================================" +#define SECTION_SEGMENT_LEN 40 +static const char *section_line = SECTION_SEGMENT + SECTION_SEGMENT + SECTION_SEGMENT + SECTION_SEGMENT; +static const size_t section_len = 4 * SECTION_SEGMENT_LEN; + +static void print_table_title(const char *name, size_t *widths, size_t nr) +{ + size_t width = 3 * (nr - 1); + + for (size_t i = 0; i < nr; i++) + width += widths[i]; + + if (width > section_len) + width = section_len; + + printf("\n%s\n%.*s\n", name, (int)width, section_line); +} + +static void print_row_plaintext(struct strvec *row, size_t *widths) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < row->nr; i++) { + const char *str = row->v[i]; + size_t len = strlen(str); + if (i) + strbuf_add(&line, " | ", 3); + strbuf_addchars(&line, ' ', widths[i] - len); + strbuf_add(&line, str, len); + } + printf("%s\n", line.buf); +} + +static void print_divider_plaintext(size_t *widths, size_t nr) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < nr; i++) { + if (i) + strbuf_add(&line, "-+-", 3); + strbuf_addchars(&line, '-', widths[i]); + } + printf("%s\n", line.buf); +} + +static void print_table_plaintext(struct survey_table *table) +{ + size_t *column_widths; + size_t columns_nr = table->header.nr; + CALLOC_ARRAY(column_widths, columns_nr); + + for (size_t i = 0; i < columns_nr; i++) { + column_widths[i] = strlen(table->header.v[i]); + + for (size_t j = 0; j < table->rows_nr; j++) { + size_t rowlen = strlen(table->rows[j].v[i]); + if (column_widths[i] < rowlen) + column_widths[i] = rowlen; + } + } 
+ + print_table_title(table->table_name, column_widths, columns_nr); + print_row_plaintext(&table->header, column_widths); + print_divider_plaintext(column_widths, columns_nr); + + for (size_t j = 0; j < table->rows_nr; j++) + print_row_plaintext(&table->rows[j], column_widths); + + free(column_widths); +} + +static void survey_report_plaintext_refs(struct survey_context *ctx) +{ + struct survey_report_ref_summary *refs = &ctx->report.refs; + struct survey_table table = SURVEY_TABLE_INIT; + + table.table_name = _("REFERENCES SUMMARY"); + + strvec_push(&table.header, _("Ref Type")); + strvec_push(&table.header, _("Count")); + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_branches) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->branches_nr); + insert_table_rowv(&table, _("Branches"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_remotes) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->remote_refs_nr); + insert_table_rowv(&table, _("Remote refs"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_tags) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->tags_nr); + insert_table_rowv(&table, _("Tags (all)"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->tags_annotated_nr); + insert_table_rowv(&table, _("Tags (annotated)"), fmt, NULL); + free(fmt); + } + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_plaintext(struct survey_context *ctx) +{ + printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); + printf("-----------------------------------------------------\n"); + survey_report_plaintext_refs(ctx); +} + /* * After parsing the command line arguments, figure out which refs we * should scan. 
@@ -317,6 +472,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_phase_refs(&ctx); + survey_report_plaintext(&ctx); + clear_survey_context(&ctx); return 0; } diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 6656cf20bf7a17..b76064b2a867ac 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -21,7 +21,23 @@ test_expect_success 'create a semi-interesting repo' ' test_expect_success 'git survey (default)' ' git survey >out 2>err && - test_line_count = 0 err + test_line_count = 0 err && + + tr , " " >expect <<-EOF && + GIT SURVEY for "$(pwd)" + ----------------------------------------------------- + + REFERENCES SUMMARY + ======================== + , Ref Type | Count + -----------------+------ + , Branches | 1 + Remote refs | 0 + Tags (all) | 0 + Tags (annotated) | 0 + EOF + + test_cmp expect out ' test_done From 9e2f0af39502d6a7c49838ed09d7f629b74f92cb Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 20:33:47 -0400 Subject: [PATCH 04/10] survey: add object count summary At the moment, nothing is obvious about the reason for the use of the path-walk API, but this will become more prevalent in future iterations. For now, use the path-walk API to sum up the counts of each kind of object. 
For example, this is the reachable object summary output for my local repo: REACHABLE OBJECT SUMMARY ======================== Object Type | Count ------------+------- Tags | 1343 Commits | 179344 Trees | 314350 Blobs | 184030 Signed-off-by: Derrick Stolee --- Documentation/git-survey.txt | 6 ++ builtin/survey.c | 131 +++++++++++++++++++++++++++++++++-- t/t8100-git-survey.sh | 23 ++++-- 3 files changed, 149 insertions(+), 11 deletions(-) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 25d10781831c99..894c7be3053eb9 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -72,6 +72,12 @@ The references summary includes a count of each kind of reference, including branches, remote refs, and tags (split by "all" and "annotated"). +Reachable Object Summary +~~~~~~~~~~~~~~~~~~~~~~~~ + +The reachable object summary shows the total number of each kind of Git +object, including tags, commits, trees, and blobs. + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/survey.c b/builtin/survey.c index ffb6836de0aebb..c1240f2c356621 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -2,13 +2,20 @@ #include "builtin.h" #include "config.h" +#include "environment.h" +#include "hex.h" #include "object.h" +#include "object-name.h" #include "object-store-ll.h" #include "parse-options.h" +#include "path-walk.h" #include "progress.h" #include "ref-filter.h" +#include "refs.h" +#include "revision.h" #include "strbuf.h" #include "strvec.h" +#include "tag.h" #include "trace2.h" static const char * const survey_usage[] = { @@ -46,12 +53,20 @@ struct survey_report_ref_summary { size_t unknown_nr; }; +struct survey_report_object_summary { + size_t commits_nr; + size_t tags_nr; + size_t trees_nr; + size_t blobs_nr; +}; + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. 
*/ struct survey_report { struct survey_report_ref_summary refs; + struct survey_report_object_summary reachable_objects; }; struct survey_context { @@ -74,10 +89,12 @@ struct survey_context { size_t progress_total; struct strvec refs; + struct ref_array ref_array; }; static void clear_survey_context(struct survey_context *ctx) { + ref_array_clear(&ctx->ref_array); strvec_clear(&ctx->refs); } @@ -128,10 +145,14 @@ static const size_t section_len = 4 * SECTION_SEGMENT_LEN; static void print_table_title(const char *name, size_t *widths, size_t nr) { size_t width = 3 * (nr - 1); + size_t min_width = strlen(name); for (size_t i = 0; i < nr; i++) width += widths[i]; + if (width < min_width) + width = min_width; + if (width > section_len) width = section_len; @@ -228,11 +249,43 @@ static void survey_report_plaintext_refs(struct survey_context *ctx) clear_table(&table); } +static void survey_report_plaintext_reachable_object_summary(struct survey_context *ctx) +{ + struct survey_report_object_summary *objs = &ctx->report.reachable_objects; + struct survey_table table = SURVEY_TABLE_INIT; + char *fmt; + + table.table_name = _("REACHABLE OBJECT SUMMARY"); + + strvec_push(&table.header, _("Object Type")); + strvec_push(&table.header, _("Count")); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->tags_nr); + insert_table_rowv(&table, _("Tags"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->commits_nr); + insert_table_rowv(&table, _("Commits"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->trees_nr); + insert_table_rowv(&table, _("Trees"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->blobs_nr); + insert_table_rowv(&table, _("Blobs"), fmt, NULL); + free(fmt); + + print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); 
printf("-----------------------------------------------------\n"); survey_report_plaintext_refs(ctx); + survey_report_plaintext_reachable_object_summary(ctx); } /* @@ -380,15 +433,13 @@ static void do_load_refs(struct survey_context *ctx, */ static void survey_phase_refs(struct survey_context *ctx) { - struct ref_array ref_array = { 0 }; - trace2_region_enter("survey", "phase/refs", ctx->repo); - do_load_refs(ctx, &ref_array); + do_load_refs(ctx, &ctx->ref_array); - ctx->report.refs.refs_nr = ref_array.nr; - for (size_t i = 0; i < ref_array.nr; i++) { + ctx->report.refs.refs_nr = ctx->ref_array.nr; + for (size_t i = 0; i < ctx->ref_array.nr; i++) { unsigned long size; - struct ref_array_item *item = ref_array.items[i]; + struct ref_array_item *item = ctx->ref_array.items[i]; switch (item->kind) { case FILTER_REFS_TAGS: @@ -418,8 +469,72 @@ static void survey_phase_refs(struct survey_context *ctx) } trace2_region_leave("survey", "phase/refs", ctx->repo); +} + +static void increment_object_counts( + struct survey_report_object_summary *summary, + enum object_type type, + size_t nr) +{ + switch (type) { + case OBJ_COMMIT: + summary->commits_nr += nr; + break; - ref_array_clear(&ref_array); + case OBJ_TREE: + summary->trees_nr += nr; + break; + + case OBJ_BLOB: + summary->blobs_nr += nr; + break; + + case OBJ_TAG: + summary->tags_nr += nr; + break; + + default: + break; + } +} + +static int survey_objects_path_walk_fn(const char *path, + struct oid_array *oids, + enum object_type type, + void *data) +{ + struct survey_context *ctx = data; + + increment_object_counts(&ctx->report.reachable_objects, + type, oids->nr); + + return 0; +} + +static void survey_phase_objects(struct survey_context *ctx) +{ + struct rev_info revs = REV_INFO_INIT; + struct path_walk_info info = PATH_WALK_INFO_INIT; + unsigned int add_flags = 0; + + trace2_region_enter("survey", "phase/objects", ctx->repo); + + info.revs = &revs; + info.path_fn = survey_objects_path_walk_fn; + info.path_fn_data = 
ctx; + + repo_init_revisions(ctx->repo, &revs, ""); + revs.tag_objects = 1; + + for (size_t i = 0; i < ctx->ref_array.nr; i++) { + struct ref_array_item *item = ctx->ref_array.items[i]; + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + } + + walk_objects_by_path(&info); + + release_revisions(&revs); + trace2_region_leave("survey", "phase/objects", ctx->repo); } int cmd_survey(int argc, const char **argv, const char *prefix) @@ -472,6 +587,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_phase_refs(&ctx); + survey_phase_objects(&ctx); + survey_report_plaintext(&ctx); clear_survey_context(&ctx); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index b76064b2a867ac..7a37da1bb2dadc 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -16,11 +16,17 @@ test_expect_success 'git survey -h shows experimental warning' ' ' test_expect_success 'create a semi-interesting repo' ' - test_commit_bulk 10 + test_commit_bulk 10 && + git tag -a -m one one HEAD~5 && + git tag -a -m two two HEAD~3 && + git tag -a -m three three two && + git tag -a -m four four three && + git update-ref -d refs/tags/three && + git update-ref -d refs/tags/two ' test_expect_success 'git survey (default)' ' - git survey >out 2>err && + git survey --all-refs >out 2>err && test_line_count = 0 err && tr , " " >expect <<-EOF && @@ -33,8 +39,17 @@ test_expect_success 'git survey (default)' ' -----------------+------ , Branches | 1 Remote refs | 0 - Tags (all) | 0 - Tags (annotated) | 0 + Tags (all) | 2 + Tags (annotated) | 2 + + REACHABLE OBJECT SUMMARY + ======================== + Object Type | Count + ------------+------ + Tags | 4 + Commits | 10 + Trees | 10 + Blobs | 10 EOF test_cmp expect out From 947c2c5155cd57ac9f751821d7c01c19e153d145 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 20:58:35 -0400 Subject: [PATCH 05/10] survey: summarize total sizes by object type Now that we have explored objects by count, we can expand 
that a bit more to summarize the data for the on-disk and inflated size of those objects. This information is helpful for diagnosing both why disk space (and perhaps clone or fetch times) is growing but also why certain operations are slow because the inflated size of the abstract objects that must be processed is so large. Signed-off-by: Derrick Stolee --- builtin/survey.c | 132 ++++++++++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 29 ++++++++++ 2 files changed, 161 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index c1240f2c356621..792624be6b1de4 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -60,6 +60,19 @@ struct survey_report_object_summary { size_t blobs_nr; }; +/** + * For some category given by 'label', count the number of objects + * that match that label along with the on-disk size and the size + * after decompressing (both with delta bases and zlib). + */ +struct survey_report_object_size_summary { + char *label; + size_t nr; + size_t disk_size; + size_t inflated_size; + size_t num_missing; +}; + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. 
@@ -67,8 +80,16 @@ struct survey_report_object_summary { struct survey_report { struct survey_report_ref_summary refs; struct survey_report_object_summary reachable_objects; + + struct survey_report_object_size_summary *by_type; }; +#define REPORT_TYPE_COMMIT 0 +#define REPORT_TYPE_TREE 1 +#define REPORT_TYPE_BLOB 2 +#define REPORT_TYPE_TAG 3 +#define REPORT_TYPE_COUNT 4 + struct survey_context { struct repository *repo; @@ -280,12 +301,48 @@ static void survey_report_plaintext_reachable_object_summary(struct survey_conte clear_table(&table); } +static void survey_report_object_sizes(const char *title, + const char *categories, + struct survey_report_object_size_summary *summary, + size_t summary_nr) +{ + struct survey_table table = SURVEY_TABLE_INIT; + table.table_name = title; + + strvec_push(&table.header, categories); + strvec_push(&table.header, _("Count")); + strvec_push(&table.header, _("Disk Size")); + strvec_push(&table.header, _("Inflated Size")); + + for (size_t i = 0; i < summary_nr; i++) { + char *label_str = xstrdup(summary[i].label); + char *nr_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].nr); + char *disk_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].disk_size); + char *inflate_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].inflated_size); + + insert_table_rowv(&table, label_str, nr_str, + disk_str, inflate_str, NULL); + + free(label_str); + free(nr_str); + free(disk_str); + free(inflate_str); + } + + print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); printf("-----------------------------------------------------\n"); survey_report_plaintext_refs(ctx); survey_report_plaintext_reachable_object_summary(ctx); + survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"), + _("Object Type"), + ctx->report.by_type, + REPORT_TYPE_COUNT); } /* @@ -498,6 +555,68 @@ static void increment_object_counts( } } +static void 
increment_totals(struct survey_context *ctx, + struct oid_array *oids, + struct survey_report_object_size_summary *summary) +{ + for (size_t i = 0; i < oids->nr; i++) { + struct object_info oi = OBJECT_INFO_INIT; + unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH; + unsigned long object_length = 0; + off_t disk_sizep = 0; + enum object_type type; + + oi.typep = &type; + oi.sizep = &object_length; + oi.disk_sizep = &disk_sizep; + + if (oid_object_info_extended(ctx->repo, &oids->oid[i], + &oi, oi_flags) < 0) { + summary->num_missing++; + } else { + summary->nr++; + summary->disk_size += disk_sizep; + summary->inflated_size += object_length; + } + } +} + +static void increment_object_totals(struct survey_context *ctx, + struct oid_array *oids, + enum object_type type) +{ + struct survey_report_object_size_summary *total; + struct survey_report_object_size_summary summary = { 0 }; + + increment_totals(ctx, oids, &summary); + + switch (type) { + case OBJ_COMMIT: + total = &ctx->report.by_type[REPORT_TYPE_COMMIT]; + break; + + case OBJ_TREE: + total = &ctx->report.by_type[REPORT_TYPE_TREE]; + break; + + case OBJ_BLOB: + total = &ctx->report.by_type[REPORT_TYPE_BLOB]; + break; + + case OBJ_TAG: + total = &ctx->report.by_type[REPORT_TYPE_TAG]; + break; + + default: + BUG("No other type allowed"); + } + + total->nr += summary.nr; + total->disk_size += summary.disk_size; + total->inflated_size += summary.inflated_size; + total->num_missing += summary.num_missing; +} + static int survey_objects_path_walk_fn(const char *path, struct oid_array *oids, enum object_type type, @@ -507,10 +626,20 @@ static int survey_objects_path_walk_fn(const char *path, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); + increment_object_totals(ctx, oids, type); return 0; } +static void initialize_report(struct survey_context *ctx) +{ + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); + ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); + 
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); + ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); + ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags")); +} + static void survey_phase_objects(struct survey_context *ctx) { struct rev_info revs = REV_INFO_INIT; @@ -523,12 +652,15 @@ static void survey_phase_objects(struct survey_context *ctx) info.path_fn = survey_objects_path_walk_fn; info.path_fn_data = ctx; + initialize_report(ctx); + repo_init_revisions(ctx->repo, &revs, ""); revs.tag_objects = 1; for (size_t i = 0; i < ctx->ref_array.nr; i++) { struct ref_array_item *item = ctx->ref_array.items[i]; add_pending_oid(&revs, NULL, &item->objectname, add_flags); + display_progress(ctx->progress, ++(ctx->progress_nr)); } walk_objects_by_path(&info); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 7a37da1bb2dadc..e738d6421a3224 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -29,6 +29,26 @@ test_expect_success 'git survey (default)' ' git survey --all-refs >out 2>err && test_line_count = 0 err && + test_oid_cache <<-EOF && + commits_size_on_disk sha1: 1523 + commits_size_on_disk sha256: 1811 + + commits_size sha1: 2153 + commits_size sha256: 2609 + + trees_size_on_disk sha1: 495 + trees_size_on_disk sha256: 635 + + trees_size sha1: 1706 + trees_size sha256: 2366 + + tags_size sha1: 528 + tags_size sha256: 624 + + tags_size_on_disk sha1: 510 + tags_size_on_disk sha256: 569 + EOF + tr , " " >expect <<-EOF && GIT SURVEY for "$(pwd)" ----------------------------------------------------- @@ -50,6 +70,15 @@ test_expect_success 'git survey (default)' ' Commits | 10 Trees | 10 Blobs | 10 + + TOTAL OBJECT SIZES BY TYPE + =============================================== + Object Type | Count | Disk Size | Inflated Size + ------------+-------+-----------+-------------- + Commits | 10 | $(test_oid commits_size_on_disk) | $(test_oid commits_size) + Trees | 10 | $(test_oid trees_size_on_disk) | 
$(test_oid trees_size) + Blobs | 10 | 191 | 101 + Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size) EOF test_cmp expect out From 4e428263ce9fc2d3f9e68607d0c8e3b9fbb78d65 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 21:21:54 -0400 Subject: [PATCH 06/10] survey: show progress during object walk Signed-off-by: Derrick Stolee --- builtin/survey.c | 14 ++++++++++++++ t/t8100-git-survey.sh | 5 +++++ 2 files changed, 19 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 792624be6b1de4..c90a73ba85caf2 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -628,6 +628,9 @@ static int survey_objects_path_walk_fn(const char *path, type, oids->nr); increment_object_totals(ctx, oids, type); + ctx->progress_nr += oids->nr; + display_progress(ctx->progress, ctx->progress_nr); + return 0; } @@ -657,13 +660,24 @@ static void survey_phase_objects(struct survey_context *ctx) repo_init_revisions(ctx->repo, &revs, ""); revs.tag_objects = 1; + ctx->progress_nr = 0; + ctx->progress_total = ctx->ref_array.nr; + if (ctx->opts.show_progress) + ctx->progress = start_progress(_("Preparing object walk"), + ctx->progress_total); for (size_t i = 0; i < ctx->ref_array.nr; i++) { struct ref_array_item *item = ctx->ref_array.items[i]; add_pending_oid(&revs, NULL, &item->objectname, add_flags); display_progress(ctx->progress, ++(ctx->progress_nr)); } + stop_progress(&ctx->progress); + ctx->progress_nr = 0; + ctx->progress_total = 0; + if (ctx->opts.show_progress) + ctx->progress = start_progress(_("Walking objects"), 0); walk_objects_by_path(&info); + stop_progress(&ctx->progress); release_revisions(&revs); trace2_region_leave("survey", "phase/objects", ctx->repo); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index e738d6421a3224..6c2867c11c323c 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -25,6 +25,11 @@ test_expect_success 'create a semi-interesting repo' ' git update-ref -d refs/tags/two ' 
+test_expect_success 'git survey --progress' ' + GIT_PROGRESS_DELAY=0 git survey --all-refs --progress >out 2>err && + grep "Preparing object walk" err +' + test_expect_success 'git survey (default)' ' git survey --all-refs >out 2>err && test_line_count = 0 err && From 2a99b7c57ca7d26d7499a990b06cac9473e0b181 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 22:35:06 -0400 Subject: [PATCH 07/10] survey: add ability to track prioritized lists In future changes, we will make use of these methods. The intention is to keep track of the top contributors according to some metric. We don't want to store all of the entries and do a sort at the end, so track a constant-size table and remove rows that get pushed out depending on the chosen sorting algorithm. Co-authored-by: Jeff Hostetler Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- builtin/survey.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index c90a73ba85caf2..ac68ac942b3c71 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -73,6 +73,119 @@ struct survey_report_object_size_summary { size_t num_missing; }; +typedef int (*survey_top_cmp)(void *v1, void *v2); + +MAYBE_UNUSED +static int cmp_by_nr(void *v1, void *v2) +{ + struct survey_report_object_size_summary *s1 = v1; + struct survey_report_object_size_summary *s2 = v2; + + if (s1->nr < s2->nr) + return -1; + if (s1->nr > s2->nr) + return 1; + return 0; +} + +MAYBE_UNUSED +static int cmp_by_disk_size(void *v1, void *v2) +{ + struct survey_report_object_size_summary *s1 = v1; + struct survey_report_object_size_summary *s2 = v2; + + if (s1->disk_size < s2->disk_size) + return -1; + if (s1->disk_size > s2->disk_size) + return 1; + return 0; +} + +MAYBE_UNUSED +static int cmp_by_inflated_size(void *v1, void *v2) +{ + struct survey_report_object_size_summary *s1 = v1; + struct survey_report_object_size_summary *s2 = v2; + + if
(s1->inflated_size < s2->inflated_size) + return -1; + if (s1->inflated_size > s2->inflated_size) + return 1; + return 0; +} + +/** + * Store a list of "top" categories by some sorting function. When + * inserting a new category, reorder the list and free the one that + * got ejected (if any). + */ +struct survey_report_top_table { + const char *name; + survey_top_cmp cmp_fn; + size_t nr; + size_t alloc; + + /** + * 'data' stores an array of structs and must be cast into + * the proper array type before evaluating an index. + */ + void *data; +}; + +MAYBE_UNUSED +static void init_top_sizes(struct survey_report_top_table *top, + size_t limit, const char *name, + survey_top_cmp cmp) +{ + struct survey_report_object_size_summary *sz_array; + + top->name = name; + top->cmp_fn = cmp; + top->alloc = limit; + top->nr = 0; + + CALLOC_ARRAY(sz_array, limit); + top->data = sz_array; +} + +MAYBE_UNUSED +static void clear_top_sizes(struct survey_report_top_table *top) +{ + struct survey_report_object_size_summary *sz_array = top->data; + + for (size_t i = 0; i < top->nr; i++) + free(sz_array[i].label); + free(sz_array); +} + +MAYBE_UNUSED +static void maybe_insert_into_top_size(struct survey_report_top_table *top, + struct survey_report_object_size_summary *summary) +{ + struct survey_report_object_size_summary *sz_array = top->data; + size_t pos = top->nr; + + /* Compare against list from the bottom. */ + while (pos > 0 && top->cmp_fn(&sz_array[pos - 1], summary) < 0) + pos--; + + /* Not big enough! */ + if (pos >= top->alloc) + return; + + /* We need to shift the data. 
*/ + if (top->nr == top->alloc) + free(sz_array[top->nr - 1].label); + else + top->nr++; + + for (size_t i = top->nr - 1; i > pos; i--) + memcpy(&sz_array[i], &sz_array[i - 1], sizeof(*sz_array)); + + memcpy(&sz_array[pos], summary, sizeof(*summary)); + sz_array[pos].label = xstrdup(summary->label); +} + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. From af8bd64c1d85cd05c0c2b14a8d293697482ebd89 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 22:35:40 -0400 Subject: [PATCH 08/10] survey: add report of "largest" paths Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output. 
Signed-off-by: Derrick Stolee --- builtin/survey.c | 77 +++++++++++++++++++++++++++++++++++++++---- t/t8100-git-survey.sh | 12 ++++++- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index ac68ac942b3c71..4aadf7dfc3534a 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -75,7 +75,6 @@ struct survey_report_object_size_summary { typedef int (*survey_top_cmp)(void *v1, void *v2); -MAYBE_UNUSED static int cmp_by_nr(void *v1, void *v2) { struct survey_report_object_size_summary *s1 = v1; @@ -88,7 +87,6 @@ static int cmp_by_nr(void *v1, void *v2) return 0; } -MAYBE_UNUSED static int cmp_by_disk_size(void *v1, void *v2) { struct survey_report_object_size_summary *s1 = v1; @@ -101,7 +99,6 @@ static int cmp_by_disk_size(void *v1, void *v2) return 0; } -MAYBE_UNUSED static int cmp_by_inflated_size(void *v1, void *v2) { struct survey_report_object_size_summary *s1 = v1; @@ -132,7 +129,6 @@ struct survey_report_top_table { void *data; }; -MAYBE_UNUSED static void init_top_sizes(struct survey_report_top_table *top, size_t limit, const char *name, survey_top_cmp cmp) @@ -158,7 +154,6 @@ static void clear_top_sizes(struct survey_report_top_table *top) free(sz_array); } -MAYBE_UNUSED static void maybe_insert_into_top_size(struct survey_report_top_table *top, struct survey_report_object_size_summary *summary) { @@ -195,6 +190,10 @@ struct survey_report { struct survey_report_object_summary reachable_objects; struct survey_report_object_size_summary *by_type; + + struct survey_report_top_table *top_paths_by_count; + struct survey_report_top_table *top_paths_by_disk; + struct survey_report_top_table *top_paths_by_inflate; }; #define REPORT_TYPE_COMMIT 0 @@ -446,6 +445,13 @@ static void survey_report_object_sizes(const char *title, clear_table(&table); } +static void survey_report_plaintext_sorted_size( + struct survey_report_top_table *top) +{ + survey_report_object_sizes(top->name, _("Path"), + top->data, top->nr); +} + static 
void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); @@ -456,6 +462,21 @@ static void survey_report_plaintext(struct survey_context *ctx) _("Object Type"), ctx->report.by_type, REPORT_TYPE_COUNT); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]); } /* @@ -696,7 +717,8 @@ static void increment_totals(struct survey_context *ctx, static void increment_object_totals(struct survey_context *ctx, struct oid_array *oids, - enum object_type type) + enum object_type type, + const char *path) { struct survey_report_object_size_summary *total; struct survey_report_object_size_summary summary = { 0 }; @@ -728,6 +750,27 @@ static void increment_object_totals(struct survey_context *ctx, total->disk_size += summary.disk_size; total->inflated_size += summary.inflated_size; total->num_missing += summary.num_missing; + + if (type == OBJ_TREE || type == OBJ_BLOB) { + int index = type == OBJ_TREE ? + REPORT_TYPE_TREE : REPORT_TYPE_BLOB; + struct survey_report_top_table *top; + + /* + * Temporarily store (const char *) here, but it will + * be duped if inserted and will not be freed. 
+ */ + summary.label = (char *)path; + + top = ctx->report.top_paths_by_count; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_disk; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_inflate; + maybe_insert_into_top_size(&top[index], &summary); + } } static int survey_objects_path_walk_fn(const char *path, @@ -739,7 +782,7 @@ static int survey_objects_path_walk_fn(const char *path, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); - increment_object_totals(ctx, oids, type); + increment_object_totals(ctx, oids, type, path); ctx->progress_nr += oids->nr; display_progress(ctx->progress, ctx->progress_nr); @@ -749,11 +792,31 @@ static int survey_objects_path_walk_fn(const char *path, static void initialize_report(struct survey_context *ctx) { + const int top_limit = 100; + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags")); + + CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + + CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + + CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], + top_limit, 
_("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); } static void survey_phase_objects(struct survey_context *ctx) diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 6c2867c11c323c..8c6edfcae0c6c2 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -86,7 +86,17 @@ test_expect_success 'git survey (default)' ' Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size) EOF - test_cmp expect out + lines=$(wc -l <expect) && + head -n $lines out >out-trimmed && + test_cmp expect out-trimmed && + + for type in "DIRECTORIES" "FILES" + do + for metric in "COUNT" "DISK SIZE" "INFLATED SIZE" + do + grep "TOP $type BY $metric" out || return 1 + done || return 1 + done ' test_done From f18c0c215b29c138434339669c4eb1852d3f3fbc Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 23 Sep 2024 15:38:25 -0400 Subject: [PATCH 09/10] survey: add --top=<N> option and config The 'git survey' builtin provides several detail tables, such as "top files by on-disk size". The size of these tables defaults to 100, currently. Allow the user to specify this number via a new --top=<N> option or the new survey.top config key. Signed-off-by: Derrick Stolee --- Documentation/config/survey.txt | 3 +++ builtin/survey.c | 22 ++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt index c1b0f852a1250e..9e594a2092f225 100644 --- a/Documentation/config/survey.txt +++ b/Documentation/config/survey.txt @@ -8,4 +8,7 @@ survey.*:: This boolean value implies the `--[no-]verbose` option. progress:: This boolean value implies the `--[no-]progress` option. + top:: + This integer value implies `--top=<N>`, specifying the + number of entries in the detail tables. 
-- diff --git a/builtin/survey.c b/builtin/survey.c index 4aadf7dfc3534a..b52ece6c2427a6 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -40,6 +40,7 @@ static struct survey_refs_wanted default_ref_options = { struct survey_opts { int verbose; int show_progress; + int top_nr; struct survey_refs_wanted refs; }; @@ -548,6 +549,10 @@ static int survey_load_config_cb(const char *var, const char *value, ctx->opts.show_progress = git_config_bool(var, value); return 0; } + if (!strcmp(var, "survey.top")) { + ctx->opts.top_nr = git_config_int(var, value, cctx->kvi); + return 0; + } return git_default_config(var, value, cctx, pvoid); } @@ -792,8 +797,6 @@ static int survey_objects_path_walk_fn(const char *path, static void initialize_report(struct survey_context *ctx) { - const int top_limit = 100; - CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); @@ -802,21 +805,21 @@ static void initialize_report(struct survey_context *ctx) CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], - top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); + ctx->opts.top_nr, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], - top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + ctx->opts.top_nr, _("TOP FILES BY COUNT"), cmp_by_nr); CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], - top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + ctx->opts.top_nr, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], - top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + ctx->opts.top_nr, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); 
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], - top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + ctx->opts.top_nr, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], - top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); + ctx->opts.top_nr, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); } static void survey_phase_objects(struct survey_context *ctx) @@ -865,6 +868,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix) .opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + .top_nr = 100, .refs.want_all_refs = -1, @@ -880,6 +884,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix) static struct option survey_options[] = { OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + OPT_INTEGER('n', "top", &ctx.opts.top_nr, + N_("number of entries to include in detail tables")), OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), From d28dc5bbabc6c35f19e50b94628271c128a9481c Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 1 Jul 2024 23:28:45 +0200 Subject: [PATCH 10/10] survey: clearly note the experimental nature in the output While this command is definitely something we _want_, chances are that upstreaming this will require substantial changes. We still want to be able to experiment with this before that, to focus on what we need out of this command: To assist with diagnosing issues with large repositories, as well as to help monitoring the growth and the associated painpoints of such repositories. 
To that end, we are about to integrate this command into `microsoft/git`, to get the tool into the hands of users who need it most, with the idea to iterate in close collaboration between these users and the developers familiar with Git's internals. However, we will definitely want to avoid letting anybody have the impression that this command, its exact inner workings, as well as its output format, are anywhere close to stable. To make that fact utterly clear (and thereby protect the freedom to iterate and innovate freely before upstreaming the command), let's mark its output as experimental in all-caps, as the first thing we do. Signed-off-by: Johannes Schindelin --- builtin/survey.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index b52ece6c2427a6..59023fd365b93a 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -17,6 +17,7 @@ #include "strvec.h" #include "tag.h" #include "trace2.h" +#include "color.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) git survey "), @@ -901,6 +902,11 @@ int cmd_survey(int argc, const char **argv, const char *prefix) if (argc == 2 && !strcmp(argv[1], "-h")) usage_with_options(survey_usage, survey_options); + if (isatty(2)) + color_fprintf_ln(stderr, + want_color_fd(2, GIT_COLOR_AUTO) ? GIT_COLOR_YELLOW : "", + "(THIS IS EXPERIMENTAL, EXPECT THE OUTPUT FORMAT TO CHANGE!)"); + ctx.repo = the_repository; prepare_repo_settings(ctx.repo);