From d3f5b1803aac378ec2dd18c93020e2a8f442e680 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Tue, 15 Oct 2024 11:38:20 +0200 Subject: [PATCH] feat: annex interfaces and initialization `Repo` and `Worktree` received dedicated, optional support for annexes, and their initialization. The concept is substantially different from that implemented in legacy DataLad. There, an `AnnexRepo` class was derived from a `GitRepo` class and extended and overwrote individual methods, forming a relatively high-level API. Here, any `Repo` or `Worktree` can have an optional annex. All operations related to that annex are implemented in dedicated handlers that are fully independent of a `Repo` or `Worktree` instance. The aim is to reduce the complex interdependencies that cripple the validity and robustness of the legacy implementations. Git is used more "directly", even in git-annex repositories and git-annex is used more directly, and agnostic of the context and demands of high-level operations. This has important consequences. For example, a method like the legacy `AnnexRepo.save()` cannot exist, because there is no `GitRepo.save` that it can override and "do the right thing" for a git-annex repository. This is acceptable here, because the API provided in this module is not targeting the level of compound/convenience operations like `save()` that aim to alleviate a developer's required expertise. Instead, the aim here is to provide primitives that can be used in a higher-level (possibly function-based) API. 
--- datalad_core/repo/__init__.py | 30 ++++- datalad_core/repo/annex.py | 149 +++++++++++++++++++++++ datalad_core/repo/repo.py | 75 +++++++++++- datalad_core/repo/tests/test_annex.py | 116 ++++++++++++++++++ datalad_core/repo/tests/test_worktree.py | 2 +- datalad_core/repo/worktree.py | 36 +++++- 6 files changed, 402 insertions(+), 6 deletions(-) create mode 100644 datalad_core/repo/annex.py create mode 100644 datalad_core/repo/tests/test_annex.py diff --git a/datalad_core/repo/__init__.py b/datalad_core/repo/__init__.py index a5cb526..8454ad1 100644 --- a/datalad_core/repo/__init__.py +++ b/datalad_core/repo/__init__.py @@ -1,8 +1,24 @@ """Repository and worktree representations -The classes in this module implement the "flyweight" pattern. This means that, -within the same process, creating instances of :class:`Repo` and -:class:`Worktree` always yields the same instance for the same path location. +This module provides the essential types for working with Git/git-annex +repositories and DataLad datasets. + +For most use cases, the :class:`Worktree` class is the main entrypoint. It can +be pointed to an existing Git repository checkout, or a new repository can be +created via its :meth:`Worktree.init_at` class method. Access to operations on +the underlying Git repository (that may be shared by additional worktrees) is +possible via the :attr:`Worktree.repo` attribute. An optional (git-)annex can +be initialized (:meth:`Worktree.init_annex`), and accessed via +:attr:`Worktree.annex`. + +Working with bare Git repositories is supported by using the :class:`Repo` +class directly. Operations on an annex of a bare repository are accessible +via the :attr:`Repo.bare_annex` attribute. + +The :class:`Repo` and :class:`Worktree` classes in this module implement the +"flyweight" pattern. This means that, within the same process, creating +instances of :class:`Repo` and :class:`Worktree` always yields the same +instance for the same path location. .. 
currentmodule:: datalad_core.repo .. autosummary:: @@ -10,12 +26,20 @@ Repo Worktree + Annex + BareRepoAnnex """ __all__ = [ 'Repo', 'Worktree', + 'Annex', + 'BareRepoAnnex', ] +from .annex import ( + Annex, + BareRepoAnnex, +) from .repo import Repo from .worktree import Worktree diff --git a/datalad_core/repo/annex.py b/datalad_core/repo/annex.py new file mode 100644 index 0000000..f0bb9ee --- /dev/null +++ b/datalad_core/repo/annex.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +from pathlib import Path + +from datalad_core.runners import ( + CommandError, + call_annex_json_lines, + call_git_lines, +) + + +class AnnexBase: + """Base class for annex handler + + This class should not be used directly, but any of its subclasses. + + Methods of this class implement operations that work with annexes + in bare and non-bare repositories. + """ + + def __init__(self, path: Path): + self._path = path + self._uuid: str | None = None + + @property + def path(self) -> Path: + """Path used by git-annex commands as PWD""" + return self._path + + # not using a cached_property, because it would not prevent write-access + @property + def uuid(self) -> str: + """UUID identifier of the local ("here") annex + + Once accessed, the value is cached for the lifetime of the instance. + """ + + if self._uuid is None: + for ax in self.list_annexes(): + if ax['here'] is True: + self._uuid = str(ax['uuid']) + return self._uuid + if self._uuid is None: # pragma: no cover + msg = 'annex unexpectedly has no UUID' + raise RuntimeError(msg) + return self._uuid + + def list_annexes(self) -> list[dict[str, str | bool]]: + """Returns a list with information on known annexes for a repository + + Each item in the list corresponds to an annex and is a dictionary + with annex properties. 
Dictionary ``str`` keys are + + - ``uuid``: annex UUID identifier + - ``description``: a custom or generated description of that annex + - ``here``: a boolean flag whether that annex is local to this + repository + - ``trust``: a label (``trusted``, ``semitrusted``, ``untrusted``) + indicating the trust assignment for that annex + """ + res = list(call_annex_json_lines(['info', '--fast'], cwd=self.path)) + if len(res) != 1: # pragma: no cover + msg = 'unexpected output from git-annex-info' + raise RuntimeError(msg) + info = res[0] + annexes: list[dict[str, str | bool]] = [] + for rt in ( + 'untrusted repositories', + 'semitrusted repositories', + 'trusted repositories', + ): + for r in info[rt]: + # TODO: make ENUM? + r.update(trust=rt.split(' ')[0]) + annexes.append(r) + return annexes + + +class BareRepoAnnex(AnnexBase): + # ATTN: This class should not get (many) additional methods. Instead, they + # should all go into AnnexBase, if they can work with bare and non-bare + # repositories. The purpose of this class is mainly to enforce use with a + # bare repository in its __init__() + """Interface for an annex in a bare Git repository + + Annex operations are executed in the context of a Git repository. + This repository must be a bare repository. An exception is raised, + if this class is used with a non-bare repository. + """ + + def __init__(self, path: Path): + """ + The given ``path`` must point to a bare Git repository and is used + to resolve and confirm the presence of an annex. 
+ """ + bare, annex_loc = call_git_lines( + [ + '-C', + str(path), + 'rev-parse', + '--path-format=absolute', + '--is-bare-repository', + '--git-path', + 'annex', + ], + ) + if bare != 'true': + msg = f'not a bare repository at {path}' + raise ValueError(msg) + # this simple test is also what is done in legacy AnnexRepo + annex_path = Path(annex_loc) + if not annex_path.exists(): + msg = f'no repository annex found at {annex_path}' + raise ValueError(msg) + super().__init__(annex_path.parent) + + +class Annex(AnnexBase): + """Interface for an annex in a non-bare Git repository + + Annex operations are executed in the context of a Git worktree. + """ + + def __init__(self, path: Path): + """ + The given ``path`` must point to a Git repository worktree + and is used to resolve and confirm the presence of an annex. + """ + try: + annex_loc, worktree_loc = call_git_lines( + [ + '-C', + str(path), + 'rev-parse', + '--path-format=absolute', + '--git-path', + 'annex', + '--show-toplevel', + ], + ) + except CommandError as e: + msg = f'cannot resolve paths for a worktree with an annex at {path}' + raise ValueError(msg) from e + # this simple test is also what is done in legacy AnnexRepo + annex_path = Path(annex_loc) + if not annex_path.exists(): + msg = f'no repository annex found at {annex_path}' + raise ValueError(msg) + super().__init__(Path(worktree_loc)) diff --git a/datalad_core/repo/repo.py b/datalad_core/repo/repo.py index 71ceeb8..c6afc06 100644 --- a/datalad_core/repo/repo.py +++ b/datalad_core/repo/repo.py @@ -18,8 +18,12 @@ LocalGitConfig, get_manager, ) +from datalad_core.repo.annex import BareRepoAnnex from datalad_core.repo.gitmanaged import GitManaged -from datalad_core.runners import call_git +from datalad_core.runners import ( + call_annex_json_lines, + call_git, +) class Repo(GitManaged): @@ -45,6 +49,7 @@ def __init__(self, path: Path): def reset(self) -> None: super().reset() self._config: ConfigManager | None = None + self._annex: BareRepoAnnex | 
None = None @property def config(self) -> ConfigManager: @@ -87,6 +92,56 @@ def config(self) -> ConfigManager: self._config = lman return self._config + def init_annex( + self, + description: str | None = None, + *, + autoenable_remotes: bool = True, + ) -> BareRepoAnnex: + """ """ + if self.config.get('core.bare', False).value is False: + msg = ( + 'Cannot initialize annex in a non-bare repository, ' + 'use Worktree.init_annex()' + ) + raise TypeError(msg) + self._init_annex( + self.path, + description=description, + autoenable_remotes=autoenable_remotes, + ) + annex = self.bare_annex + if annex is None: # pragma: no cover + msg = 'could not initialize annex unexpectedly' + raise RuntimeError(msg) + return annex + + # we name this "bare_annex" not just "annex", even though it is clunky, + # to avoid the confusions associated with "but it has an annex, it is + # just not a bare repository" + @property + def bare_annex(self) -> BareRepoAnnex | None: + """Handler for a bare repository's annex + + If there is no initialized annex, or the repository is not bare, + this will be ``None``. + + To get a handler for a non-bare repository's annex use + :attr:`Worktree.annex`. + """ + if self.config.get('core.bare', False).value is False: + return None + if self._annex is None: + try: + self._annex = BareRepoAnnex(self.path) + except ValueError: + # resetting it to None means that we will keep trying to + # locate an annex each time. I believe this is a sensible + # behavior. 
A once-present annex is unlikely to go away, + # but an annex could be initialized at any time + self._annex = None + return self._annex + @classmethod def init_at(cls, path: Path) -> Repo: """Initialize a bare repository in an existing directory @@ -102,3 +157,21 @@ def init_at(cls, path: Path) -> Repo: capture_output=True, ) return cls(path) + + def _init_annex( + self, + exec_path: Path, + *, + description: str | None = None, + autoenable_remotes: bool = True, + ) -> None: + """ """ + cmd = ['init'] + if not autoenable_remotes: + # no, we do not set --autoenable, this is a RepoAnnex feature + cmd.append('--no-autoenable') + if description is not None: + cmd.append(description) + # collect all items, we only expect a single one + # TODO: consume()? + list(call_annex_json_lines(cmd, cwd=exec_path)) diff --git a/datalad_core/repo/tests/test_annex.py b/datalad_core/repo/tests/test_annex.py new file mode 100644 index 0000000..9b00bc0 --- /dev/null +++ b/datalad_core/repo/tests/test_annex.py @@ -0,0 +1,116 @@ +import pytest + +from datalad_core.runners import ( + CommandError, +) + +from ..annex import ( + Annex, + BareRepoAnnex, +) +from ..repo import Repo +from ..worktree import Worktree + + +def test_barerepoannex_error(baregitrepo): + with pytest.raises(ValueError, match='no repository annex found'): + BareRepoAnnex(baregitrepo) + + +def test_barerepoannex_nonbare(annexrepo): + with pytest.raises(ValueError, match='not a bare repository'): + BareRepoAnnex(annexrepo) + + wt = Worktree(annexrepo) + with pytest.raises(TypeError, match='Cannot initialize annex in a non-bare repo'): + wt.repo.init_annex() + + assert wt.repo.bare_annex is None + + +# # check annex location resolution. 
it should be fairly robust and +# # pretty much always find the annex, as long as the path points +# # anywhere inside a git repo +# annex0 = BareRepoAnnex(annexrepo) +# annex1 = BareRepoAnnex(annexrepo / '.git') +# annex2 = BareRepoAnnex(annexrepo / '.git' / 'annex') +# testdir = annexrepo / 'somedir' +# testdir.mkdir() +# annex3 = BareRepoAnnex(testdir) +# assert annex0.path == annex1.path == annex2.path == annex3.path + + +def test_annex_error(annexrepo): + with pytest.raises(ValueError, match='cannot resolve path'): + Annex(Worktree(annexrepo).git_dir) + + +def test_annex_noannex(gitrepo): + with pytest.raises(ValueError, match='no repository annex found'): + Annex(gitrepo) + + wt = Worktree(gitrepo) + assert wt.annex is None + # and it keeps it that way on repeated trials + assert wt.annex is None + + +def test_annex(annexrepo): + annex0 = Annex(annexrepo) + # initialization is robust to "anywhere in repo" + testdir = annexrepo / 'somedir' + testdir.mkdir() + annex1 = Annex(testdir) + assert annex0.path == annex1.path == annexrepo + assert annex0.uuid + + +def test_repo_init_annex_error(baregitrepo): + repo = Repo(baregitrepo) + # we take the place of the annex + (repo.path / 'annex').touch() + with pytest.raises(CommandError, match='fileExist'): + repo.init_annex() + + +def test_worktree_init_annex(gitrepo): + wt = Worktree(gitrepo) + annex = wt.init_annex() + assert wt.annex is annex + assert wt.annex.uuid + + +def test_repo_init_annex(baregitrepo): + repo = Repo(baregitrepo) + assert repo.bare_annex is None + + # setting the flag has no effect here, it just exercises the + # code path + annex = repo.init_annex('testannex', autoenable_remotes=False) + assert annex is repo.bare_annex + assert repo.bare_annex.uuid + # ask again to exercise cached code path + assert repo.bare_annex.uuid + + +def test_relocate_repo_w_annex(tmp_path): + wt_dir = tmp_path / 'wt' + orig_repo_dir = tmp_path / 'orig_repo' + new_repo_dir = tmp_path / 'new_repo' + + wt_dir.mkdir() + + 
wt = Worktree.init_at(wt_dir, gitdir=orig_repo_dir) + wt.init_annex() + assert wt.repo.path == orig_repo_dir + # annex commands run in the context of the worktree, not the repo + assert wt.annex.path == wt.path + annexes = wt.annex.list_annexes() + assert len(annexes) > 1 + + wt_new = Worktree.init_at(wt_dir, gitdir=new_repo_dir) + assert wt_new.repo.path == new_repo_dir + assert (new_repo_dir / 'annex').is_dir() + assert wt_new.annex.path == wt.path + # running annex commands continues to work after relocation + assert annexes == wt_new.annex.list_annexes() diff --git a/datalad_core/repo/tests/test_worktree.py b/datalad_core/repo/tests/test_worktree.py index 09b4f4d..131f600 100644 --- a/datalad_core/repo/tests/test_worktree.py +++ b/datalad_core/repo/tests/test_worktree.py @@ -107,7 +107,7 @@ def test_worktree_init_at(tmp_path): # init alternative worktree. This is not a "linked" worktree. # instead this merely points to the same repository. changes - # made in this worktree will cause unsychronized differences + # made in this worktree will cause unsynchronized differences # at `orig_wt`. 
Likely not a use case, but we are testing the # proper functioning of the mechanics anyways alt_wt_path = tmp_path / 'alt_wt' diff --git a/datalad_core/repo/worktree.py b/datalad_core/repo/worktree.py index b00dc4b..aa40566 100644 --- a/datalad_core/repo/worktree.py +++ b/datalad_core/repo/worktree.py @@ -13,6 +13,7 @@ ConfigManager, WorktreeGitConfig, ) +from datalad_core.repo.annex import Annex from datalad_core.repo.gitmanaged import GitManaged from datalad_core.repo.repo import Repo from datalad_core.runners import call_git @@ -38,6 +39,7 @@ def __init__( def reset(self) -> None: super().reset() + self._annex: Annex | None = None self._config: ConfigManager | None = None self._repo: Repo | None = None @@ -125,6 +127,38 @@ def repo(self) -> Repo: self._repo = Repo(self.git_common_dir) return self._repo + def init_annex( + self, + description: str | None = None, + *, + autoenable_remotes: bool = True, + ) -> Annex: + """ """ + # refuse for non-bare + self.repo._init_annex( # noqa: SLF001 + self.path, + description=description, + autoenable_remotes=autoenable_remotes, + ) + annex = self.annex + if annex is None: # pragma: no cover + msg = 'could not initialize annex unexpectedly' + raise RuntimeError(msg) + return annex + + @property + def annex(self) -> Annex | None: + if self._annex is None: + try: + self._annex = Annex(self.path) + except ValueError: + # resetting it to None means that we will keep trying to + # locate an annex each time. I believe this is a sensible + # behavior. A once-present annex is unlikely to go away, + # but an annex could be initialized at any time + self._annex = None + return self._annex + @classmethod def init_at(cls, path: Path, gitdir: Path | None = None) -> Worktree: """Initialize a worktree for a new/existing repository in a directory @@ -150,7 +184,7 @@ def init_at(cls, path: Path, gitdir: Path | None = None) -> Worktree: # this call could have relocated the underlying repo. 
# drop all previous references and evaluate from scratch. # we could do upfront inspection instead, but this is - # resonably cheap, and safeer to do unconditionally. + # reasonably cheap, and safer to do unconditionally. wt.repo.reset() wt.reset() return wt