diff --git a/conftest.py b/conftest.py
index 01b8b6a..45e8800 100644
--- a/conftest.py
+++ b/conftest.py
@@ -2,13 +2,19 @@

 __all__ = [
     'cfgman',
+    'bareannexrepo',
     'baregitrepo',
+    'annexrepo',
     'gitrepo',
     'verify_pristine_gitconfig_global',
 ]


 from datalad_core.tests.fixtures import (
+    # function-scope temporary Git repo with an initialized annex
+    annexrepo,
+    # function-scope temporary, bare Git repo with an initialized annex
+    bareannexrepo,
     # function-scope temporary, bare Git repo
     baregitrepo,
     # function-scope config manager
diff --git a/datalad_core/repo/__init__.py b/datalad_core/repo/__init__.py
index a5cb526..8454ad1 100644
--- a/datalad_core/repo/__init__.py
+++ b/datalad_core/repo/__init__.py
@@ -1,8 +1,24 @@
 """Repository and worktree representations

-The classes in this module implement the "flyweight" pattern. This means that,
-within the same process, creating instances of :class:`Repo` and
-:class:`Worktree` always yields the same instance for the same path location.
+This module provides the essential types for working with Git/git-annex
+repositories and DataLad datasets.
+
+For most use cases, the :class:`Worktree` class is the main entrypoint. It can
+be pointed to an existing Git repository checkout, or a new repository can be
+created via its :meth:`Worktree.init_at` class method. Access to operations on
+the underlying Git repository (that may be shared by additional worktrees) is
+possible via the :attr:`Worktree.repo` attribute. An optional (git-)annex can
+be initialized (:meth:`Worktree.init_annex`), and accessed via
+:attr:`Worktree.annex`.
+
+Working with bare Git repositories is supported by using the :class:`Repo`
+class directly. Operations on an annex of a bare repository are accessible
+via the :attr:`Repo.bare_annex` attribute.
+
+The :class:`Repo` and :class:`Worktree` classes in this module implement the
+"flyweight" pattern. This means that, within the same process, creating
+instances of :class:`Repo` and :class:`Worktree` always yields the same
+instance for the same path location.

 .. currentmodule:: datalad_core.repo
 .. autosummary::
@@ -10,12 +26,20 @@

    Repo
    Worktree
+   Annex
+   BareRepoAnnex
 """

 __all__ = [
     'Repo',
     'Worktree',
+    'Annex',
+    'BareRepoAnnex',
 ]

+from .annex import (
+    Annex,
+    BareRepoAnnex,
+)
 from .repo import Repo
 from .worktree import Worktree
diff --git a/datalad_core/repo/annex.py b/datalad_core/repo/annex.py
new file mode 100644
index 0000000..f0bb9ee
--- /dev/null
+++ b/datalad_core/repo/annex.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from datalad_core.runners import (
+    CommandError,
+    call_annex_json_lines,
+    call_git_lines,
+)
+
+
+class AnnexBase:
+    """Base class for annex handler
+
+    This class should not be used directly, but any of its subclasses.
+
+    Methods of this class implement operations that work with annexes
+    in bare and non-bare repositories.
+    """
+
+    def __init__(self, path: Path):
+        self._path = path
+        self._uuid: str | None = None
+
+    @property
+    def path(self) -> Path:
+        """Path used by git-annex commands as PWD"""
+        return self._path
+
+    # not using a cached_property, because it would not prevent write-access
+    @property
+    def uuid(self) -> str:
+        """UUID identifier of the local ("here") annex
+
+        Once accessed, the value is cached for the lifetime of the instance.
+        """
+
+        if self._uuid is None:
+            for ax in self.list_annexes():
+                if ax['here'] is True:
+                    self._uuid = str(ax['uuid'])
+                    return self._uuid
+        if self._uuid is None:  # pragma: no cover
+            msg = 'annex unexpectedly has no UUID'
+            raise RuntimeError(msg)
+        return self._uuid
+
+    def list_annexes(self) -> list[dict[str, str | bool]]:
+        """Returns a list with information on known annexes for a repository
+
+        Each item in the list corresponds to an annex and is a dictionary
+        with annex properties. Dictionary ``str`` keys are
+
+        - ``uuid``: annex UUID identifier
+        - ``description``: a custom or generated description of that annex
+        - ``here``: a boolean flag whether that annex is local to this
+          repository
+        - ``trust``: a label (``trusted``, ``semitrusted``, ``untrusted``)
+          indicating the trust assignment for that annex
+        """
+        res = list(call_annex_json_lines(['info', '--fast'], cwd=self.path))
+        if len(res) != 1:  # pragma: no cover
+            msg = 'unexpected output from git-annex-info'
+            raise RuntimeError(msg)
+        info = res[0]
+        annexes: list[dict[str, str | bool]] = []
+        for rt in (
+            'untrusted repositories',
+            'semitrusted repositories',
+            'trusted repositories',
+        ):
+            for r in info[rt]:
+                # TODO: make ENUM?
+                r.update(trust=rt.split(' ')[0])
+                annexes.append(r)
+        return annexes
+
+
+class BareRepoAnnex(AnnexBase):
+    # ATTN: This class should not get (many) additional methods. Instead, they
+    # should all go into AnnexBase, if they can work with bare and non-bare
+    # repositories. The purpose of this class is mainly to enforce use with a
+    # bare repository in its __init__()
+    """Interface for an annex in a bare Git repository
+
+    Annex operations are executed in the context of a Git repository.
+    This repository must be a bare repository. An exception is raised,
+    if this class is used with a non-bare repository.
+    """
+
+    def __init__(self, path: Path):
+        """
+        The given ``path`` must point to a bare Git repository and is used
+        to resolve and confirm the presence of an annex.
+        """
+        bare, annex_loc = call_git_lines(
+            [
+                '-C',
+                str(path),
+                'rev-parse',
+                '--path-format=absolute',
+                '--is-bare-repository',
+                '--git-path',
+                'annex',
+            ],
+        )
+        if bare != 'true':
+            msg = f'not a bare repository at {path}'
+            raise ValueError(msg)
+        # this simple test is also what is done in legacy AnnexRepo
+        annex_path = Path(annex_loc)
+        if not annex_path.exists():
+            msg = f'no repository annex found at {annex_path}'
+            raise ValueError(msg)
+        super().__init__(annex_path.parent)
+
+
+class Annex(AnnexBase):
+    """Interface for an annex in a non-bare Git repository
+
+    Annex operations are executed in the context of a Git worktree.
+    """
+
+    def __init__(self, path: Path):
+        """
+        The given ``path`` must point to a Git repository worktree
+        and is used to resolve and confirm the presence of an annex.
+        """
+        try:
+            annex_loc, worktree_loc = call_git_lines(
+                [
+                    '-C',
+                    str(path),
+                    'rev-parse',
+                    '--path-format=absolute',
+                    '--git-path',
+                    'annex',
+                    '--show-toplevel',
+                ],
+            )
+        except CommandError as e:
+            msg = f'cannot resolve paths for a worktree with an annex at {path}'
+            raise ValueError(msg) from e
+        # this simple test is also what is done in legacy AnnexRepo
+        annex_path = Path(annex_loc)
+        if not annex_path.exists():
+            msg = f'no repository annex found at {annex_path}'
+            raise ValueError(msg)
+        super().__init__(Path(worktree_loc))
diff --git a/datalad_core/repo/repo.py b/datalad_core/repo/repo.py
index 71ceeb8..b69ca09 100644
--- a/datalad_core/repo/repo.py
+++ b/datalad_core/repo/repo.py
@@ -18,7 +18,9 @@
     LocalGitConfig,
     get_manager,
 )
+from datalad_core.repo.annex import BareRepoAnnex
 from datalad_core.repo.gitmanaged import GitManaged
+from datalad_core.repo.utils import init_annex_at
 from datalad_core.runners import call_git


@@ -45,6 +47,7 @@ def __init__(self, path: Path):
     def reset(self) -> None:
         super().reset()
         self._config: ConfigManager | None = None
+        self._annex: BareRepoAnnex | None = None

     @property
     def config(self) -> ConfigManager:
@@ -87,6 +90,65 @@ def config(self) -> ConfigManager:
             self._config = lman
         return self._config

+    def init_annex(
+        self,
+        description: str | None = None,
+        *,
+        autoenable_remotes: bool = True,
+    ) -> BareRepoAnnex:
+        """Initialize an annex in this bare repository
+
+        ``description`` and ``autoenable_remotes`` are passed on to
+        ``init_annex_at()``, which runs ``git annex init`` in the
+        repository directory.
+
+        Returns the :class:`BareRepoAnnex` handler for the new annex.
+        Raises ``TypeError`` for a non-bare repository; use
+        :meth:`Worktree.init_annex` in that case.
+        """
+        if self.config.get('core.bare', False).value is False:
+            msg = (
+                'Cannot initialize annex in a non-bare repository, '
+                'use Worktree.init_annex()'
+            )
+            raise TypeError(msg)
+        init_annex_at(
+            self.path,
+            description=description,
+            autoenable_remotes=autoenable_remotes,
+        )
+        annex = self.bare_annex
+        if annex is None:  # pragma: no cover
+            msg = 'could not initialize annex unexpectedly'
+            raise RuntimeError(msg)
+        return annex
+
+    # we name this "bare_annex" not just "annex", even though it is clunky,
+    # to avoid the confusions associated with "but it has an annex, it is
+    # just not a bare repository"
+    @property
+    def bare_annex(self) -> BareRepoAnnex | None:
+        """Handler for a bare repository's annex
+
+        If there is no initialized annex, or the repository is not bare,
+        this will be ``None``.
+
+        To get a handler for a non-bare repository's annex use
+        :attr:`Worktree.annex`.
+        """
+        if self.config.get('core.bare', False).value is False:
+            return None
+        if self._annex is None:
+            try:
+                self._annex = BareRepoAnnex(self.path)
+            except ValueError:
+                # resetting it to None means that we will keep trying to
+                # locate an annex each time. I believe this is a sensible
+                # behavior. A once-present annex is unlikely to go away,
+                # but an annex could be initialized at any time
+                self._annex = None
+        return self._annex
+
     @classmethod
     def init_at(cls, path: Path) -> Repo:
         """Initialize a bare repository in an existing directory
diff --git a/datalad_core/repo/tests/test_annex.py b/datalad_core/repo/tests/test_annex.py
new file mode 100644
index 0000000..d14b8c5
--- /dev/null
+++ b/datalad_core/repo/tests/test_annex.py
@@ -0,0 +1,104 @@
+import pytest
+
+from datalad_core.runners import (
+    CommandError,
+)
+
+from ..annex import (
+    Annex,
+    BareRepoAnnex,
+)
+from ..repo import Repo
+from ..worktree import Worktree
+
+
+def test_barerepoannex_error(baregitrepo):
+    with pytest.raises(ValueError, match='no repository annex found'):
+        BareRepoAnnex(baregitrepo)
+
+
+def test_barerepoannex_nonbare(annexrepo):
+    with pytest.raises(ValueError, match='not a bare repository'):
+        BareRepoAnnex(annexrepo)
+
+    wt = Worktree(annexrepo)
+    with pytest.raises(TypeError, match='Cannot initialize annex in a non-bare repo'):
+        wt.repo.init_annex()
+
+    assert wt.repo.bare_annex is None
+
+
+def test_annex_error(annexrepo):
+    with pytest.raises(ValueError, match='cannot resolve path'):
+        Annex(Worktree(annexrepo).git_dir)
+
+
+def test_annex_noannex(gitrepo):
+    with pytest.raises(ValueError, match='no repository annex found'):
+        Annex(gitrepo)
+
+    wt = Worktree(gitrepo)
+    assert wt.annex is None
+    # and it keeps it that way on repeated trials
+    assert wt.annex is None
+
+
+def test_annex(annexrepo):
+    annex0 = Annex(annexrepo)
+    # initialization is robust to "anywhere in repo"
+    testdir = annexrepo / 'somedir'
+    testdir.mkdir()
+    annex1 = Annex(testdir)
+    assert annex0.path == annex1.path == annexrepo
+    assert annex0.uuid
+
+
+def test_repo_init_annex_error(baregitrepo):
+    repo = Repo(baregitrepo)
+    # we take the place of the annex
+    (repo.path / 'annex').touch()
+    with pytest.raises(CommandError, match='fileExist|file already exists'):
+        repo.init_annex()
+
+
+def test_worktree_init_annex(gitrepo):
+    wt = Worktree(gitrepo)
+    annex = wt.init_annex()
+    assert wt.annex is annex
+    assert wt.annex.uuid
+
+
+def test_repo_init_annex(baregitrepo):
+    repo = Repo(baregitrepo)
+    assert repo.bare_annex is None
+
+    # setting the flag has no effect here, it just exercises the
+    # code path
+    annex = repo.init_annex('testannex', autoenable_remotes=False)
+    assert annex is repo.bare_annex
+    assert repo.bare_annex.uuid
+    # ask again to exercise cached code path
+    assert repo.bare_annex.uuid
+
+
+def test_relocate_repo_w_annex(tmp_path):
+    wt_dir = tmp_path / 'wt'
+    orig_repo_dir = tmp_path / 'orig_repo'
+    new_repo_dir = tmp_path / 'new_repo'
+
+    wt_dir.mkdir()
+
+    wt = Worktree.init_at(wt_dir, gitdir=orig_repo_dir)
+    wt.init_annex()
+    assert wt.repo.path == orig_repo_dir
+    # annex commands run in the context of the worktree, not the repo
+    assert wt.annex.path == wt.path
+    annexes = wt.annex.list_annexes()
+    assert len(annexes) > 1
+
+    wt_new = Worktree.init_at(wt_dir, gitdir=new_repo_dir)
+    assert wt_new.repo.path == new_repo_dir
+    assert (new_repo_dir / 'annex').is_dir()
+    assert wt_new.annex.path == wt.path
+    # running annex commands continues to work after relocation
+    assert annexes == wt_new.annex.list_annexes()
diff --git a/datalad_core/repo/tests/test_worktree.py b/datalad_core/repo/tests/test_worktree.py
index 09b4f4d..131f600 100644
--- a/datalad_core/repo/tests/test_worktree.py
+++ b/datalad_core/repo/tests/test_worktree.py
@@ -107,7 +107,7 @@ def test_worktree_init_at(tmp_path):

     # init alternative worktree. This is not a "linked" worktree.
     # instead this merely points to the same repository. changes
-    # made in this worktree will cause unsychronized differences
+    # made in this worktree will cause unsynchronized differences
     # at `orig_wt`. Likely not a use case, but we are testing the
     # proper functioning of the mechanics anyways
     alt_wt_path = tmp_path / 'alt_wt'
diff --git a/datalad_core/repo/utils.py b/datalad_core/repo/utils.py
new file mode 100644
index 0000000..a543dfc
--- /dev/null
+++ b/datalad_core/repo/utils.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from datalad_core.runners import call_annex_json_lines
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def init_annex_at(
+    path: Path,
+    *,
+    description: str | None = None,
+    autoenable_remotes: bool = True,
+) -> None:
+    """Call ``git-annex init`` at a given ``path``"""
+    cmd = ['init']
+    if not autoenable_remotes:
+        # no, we do not set --autoenable, this is a RepoAnnex feature
+        cmd.append('--no-autoenable')
+    if description is not None:
+        cmd.append(description)
+    # collect all items, we only expect a single one
+    # TODO: consume()?
+    list(call_annex_json_lines(cmd, cwd=path))
diff --git a/datalad_core/repo/worktree.py b/datalad_core/repo/worktree.py
index b00dc4b..ff39091 100644
--- a/datalad_core/repo/worktree.py
+++ b/datalad_core/repo/worktree.py
@@ -13,8 +13,10 @@
     ConfigManager,
     WorktreeGitConfig,
 )
+from datalad_core.repo.annex import Annex
 from datalad_core.repo.gitmanaged import GitManaged
 from datalad_core.repo.repo import Repo
+from datalad_core.repo.utils import init_annex_at
 from datalad_core.runners import call_git


@@ -38,6 +40,7 @@ def __init__(

     def reset(self) -> None:
         super().reset()
+        self._annex: Annex | None = None
         self._config: ConfigManager | None = None
         self._repo: Repo | None = None

@@ -125,6 +128,52 @@ def repo(self) -> Repo:
             self._repo = Repo(self.git_common_dir)
         return self._repo

+    def init_annex(
+        self,
+        description: str | None = None,
+        *,
+        autoenable_remotes: bool = True,
+    ) -> Annex:
+        """Initialize an annex for this worktree's repository
+
+        ``git annex init`` is executed in the worktree directory via
+        ``init_annex_at()``; ``description`` and ``autoenable_remotes``
+        are passed on to it.
+
+        Returns the :class:`Annex` handler for the new annex.
+        """
+        # git-annex is initialized from within the worktree directory
+        init_annex_at(
+            self.path,
+            description=description,
+            autoenable_remotes=autoenable_remotes,
+        )
+        annex = self.annex
+        if annex is None:  # pragma: no cover
+            msg = 'could not initialize annex unexpectedly'
+            raise RuntimeError(msg)
+        return annex
+
+    @property
+    def annex(self) -> Annex | None:
+        """Handler for the annex of this worktree's repository
+
+        If no annex can be found, this will be ``None``.
+
+        To get a handler for a bare repository's annex use
+        :attr:`Repo.bare_annex`.
+        """
+        if self._annex is None:
+            try:
+                self._annex = Annex(self.path)
+            except ValueError:
+                # resetting it to None means that we will keep trying to
+                # locate an annex each time. I believe this is a sensible
+                # behavior. A once-present annex is unlikely to go away,
+                # but an annex could be initialized at any time
+                self._annex = None
+        return self._annex
+
     @classmethod
     def init_at(cls, path: Path, gitdir: Path | None = None) -> Worktree:
         """Initialize a worktree for a new/existing repository in a directory
@@ -150,7 +199,7 @@ def init_at(cls, path: Path, gitdir: Path | None = None) -> Worktree:
         # this call could have relocated the underlying repo.
         # drop all previous references and evaluate from scratch.
         # we could do upfront inspection instead, but this is
-        # resonably cheap, and safeer to do unconditionally.
+        # reasonably cheap, and safer to do unconditionally.
         wt.repo.reset()
         wt.reset()
         return wt
diff --git a/datalad_core/runners/__init__.py b/datalad_core/runners/__init__.py
index 5b59e2f..24c9586 100644
--- a/datalad_core/runners/__init__.py
+++ b/datalad_core/runners/__init__.py
@@ -16,6 +16,7 @@
    call_git_lines
    call_git_oneline
    call_git_success
+   call_annex_json_lines
    iter_subproc
    iter_git_subproc
    CommandError
@@ -29,6 +30,7 @@
     'call_git_lines',
     'call_git_oneline',
     'call_git_success',
+    'call_annex_json_lines',
 ]


@@ -37,6 +39,7 @@
     iter_subproc,
 )

+from .annex import call_annex_json_lines
 from .git import (
     call_git,
     call_git_lines,
diff --git a/datalad_core/runners/annex.py b/datalad_core/runners/annex.py
new file mode 100644
index 0000000..4e4f71c
--- /dev/null
+++ b/datalad_core/runners/annex.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from datasalad.itertools import (
+    itemize,
+    load_json,
+)
+
+from datalad_core.runners.git import iter_git_subproc
+from datalad_core.runners.imports import CommandError
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+    from pathlib import Path
+
+
+def call_annex_json_lines(
+    annex_args: list[str],
+    *,
+    git_args: list[str] | None = None,
+    cwd: Path | None = None,
+) -> Generator[Any]:
+    """Runs a git-annex command and yields JSON-based results
+
+    This function can only be used with git-annex commands that
+    support both ``--json`` and ``--json-error-messages``. These
+    options are automatically and unconditionally added to the
+    command arguments.
+
+    If ``git_args`` are provided, they will be prepended to the
+    annex command (i.e., ``git {git_args} annex {annex_args}``).
+
+    The ``cwd`` parameter is passed on to :func:`iter_git_subproc`,
+    which performs the actual command execution.
+    """
+    # copy: never extend a ``git_args`` list owned by the caller in place
+    cmd = list(git_args or [])
+    cmd.append('annex')
+    cmd.extend(annex_args)
+    cmd.extend(('--json', '--json-error-messages'))
+
+    # we collect error results in order to be able to enrich an eventual
+    # command error. this could be done in consuming code, but it
+    # makes sense to MIH to do this in this central place.
+    # the motivation here is that git-annex with --json-error-messages
+    # will communicate errors via stdout (part of JSON lines), so we yield
+    # them. But if consuming code does not actively look for information
+    # on errors in them, they will only see a CommandError, which only
+    # carries stderr capture (which hardly ever has the real info).
+    error_results = []
+
+    try:
+        with iter_git_subproc(cmd, cwd=cwd) as annex_proc:
+            for res in load_json(itemize(annex_proc, sep=None)):
+                if res.get('success', True) is False:
+                    error_results.append(
+                        {
+                            k: v
+                            for k, v in res.items()
+                            if k in ('command', 'input', 'error-messages', 'note')
+                        }
+                    )
+                yield res
+    except CommandError as e:
+        # TODO: I think we'd rather want to have an exception subclass here
+        # that can take this information in a structured fashion, and does the
+        # formatting on access
+        # keep any existing message when git-annex reported no JSON errors
+        if error_results:
+            e.msg = _format_errors(error_results)
+        raise
+
+
+def _format_errors(err: list[dict[str, str | list[str]]]) -> str:
+    """Return a single message summarizing all given error records"""
+    nerrors = len(err)
+    if not nerrors:
+        return ''
+    if nerrors == 1:
+        return _format_error(err[0])
+    return f'{nerrors} errors: {[_format_error(e) for e in err]}'
+
+
+def _format_error(err: dict[str, str | list[str]]) -> str:
+    """Return a one-line description of a single error record"""
+    # we cannot newline-join in an f-string with PY3.9, so do upfront
+    error_messages = err.get('error-messages')
+    formatted_error_msg = '\n'.join(error_messages) if error_messages else ''
+    return ''.join(
+        (
+            f'{err["command"]!r} ' if 'command' in err else '',
+            'failed',
+            f' for input {err["input"]!r}' if err.get('input') else '',
+            f' with {formatted_error_msg!r}' if error_messages else '',
+            f' [note: {err["note"]}]' if 'note' in err else '',
+        )
+    )
diff --git a/datalad_core/runners/tests/test_callannex.py b/datalad_core/runners/tests/test_callannex.py
new file mode 100644
index 0000000..7a8e51f
--- /dev/null
+++ b/datalad_core/runners/tests/test_callannex.py
@@ -0,0 +1,56 @@
+import pytest
+
+from .. import CommandError
+from ..annex import call_annex_json_lines
+
+# we do not want to afford the more_itertools dependency at this point.
+# we nevertheless want to clarify the intent
+consume = list
+
+
+def test_call_annex_json_lines(tmp_path, bareannexrepo):
+    # does not hide fundamental errors
+    with pytest.raises((FileNotFoundError, NotADirectoryError)):
+        consume(call_annex_json_lines(['info'], cwd=tmp_path / 'nothere'))
+    with pytest.raises(CommandError, match='cannot change to'):
+        consume(
+            call_annex_json_lines(['info'], git_args=['-C', str(tmp_path / 'nothere')])
+        )
+    with pytest.raises(CommandError, match='Not in a git repo'):
+        consume(call_annex_json_lines(['info'], cwd=tmp_path))
+
+    # simple test: 'gitannex-info' yields only a single JSON line.
+    # primarily checks the function parameters
+    res1 = list(call_annex_json_lines(['info'], cwd=bareannexrepo))
+    # same as above, but use gitargs to change to the repo dir
+    res2 = list(call_annex_json_lines(['info'], git_args=['-C', str(bareannexrepo)]))
+    assert len(res1) == len(res2)
+    res1 = res1[0]
+    res2 = res2[0]
+    # strip volatile properties
+    for r in (res1, res2):
+        r.pop('available local disk space')
+    assert res1 == res2
+    res = res1
+    # we get a fully decoded structure back (value is an int)
+    assert res['local annex keys'] == 0
+    # standard keys
+    assert res['success'] is True
+    assert res['error-messages'] == []
+    assert res['command'] == 'info'
+
+    local_root = '.'
+    local_remote = 'here'
+    res = list(
+        call_annex_json_lines(['info', local_root, local_remote], cwd=bareannexrepo)
+    )
+    # we get one result per request
+    assert len(res) == len((local_root, local_remote))
+    # order matches
+    assert res[0]['input'] == [local_root]
+    assert res[1]['input'] == [local_remote]
+
+
+def test_call_annex_json_lines_multierrors(annexrepo):
+    with pytest.raises(CommandError, match='2 errors.*not a directory'):
+        consume(call_annex_json_lines(['info', 'absent', 'gone'], cwd=annexrepo))
diff --git a/datalad_core/tests/fixtures.py b/datalad_core/tests/fixtures.py
index c2b57c3..f1a5f02 100644
--- a/datalad_core/tests/fixtures.py
+++ b/datalad_core/tests/fixtures.py
@@ -118,3 +118,25 @@ def baregitrepo(tmp_path_factory) -> Generator[Path]:
         capture_output=True,
     )
     return path
+
+
+@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
+def bareannexrepo(baregitrepo) -> Generator[Path]:
+    """Yield the path to a bare Git repository with an initialized annex"""
+    call_git(
+        ['annex', 'init'],
+        cwd=baregitrepo,
+        capture_output=True,
+    )
+    return baregitrepo
+
+
+@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
+def annexrepo(gitrepo) -> Generator[Path]:
+    """Yield the path to a Git repository with an initialized annex"""
+    call_git(
+        ['annex', 'init'],
+        cwd=gitrepo,
+        capture_output=True,
+    )
+    return gitrepo