Skip to content

Commit

Permalink
feat: annex interfaces and initialization
Browse files Browse the repository at this point in the history
`Repo` and `Worktree` received dedicated, optional support for annexes,
and their initialization.

The concept is substantially different from that implemented in
legacy DataLad. There, an `AnnexRepo` class was derived from a `GitRepo`
class and extended and overwrote individual methods, forming a
relatively high-level API.

Here, any `Repo` or `Worktree` can have an optional annex. All
operations related to that annex are implemented in dedicated handlers
that are fully independent of a `Repo` or `Worktree` instance.

The aim is to reduce the complex interdependencies that cripple the
validity and robustness of the legacy implementations. Git is used more
"directly", even in git-annex repositories and git-annex is used
more directly, and agnostic of the context and demands of high-level
operations.

This has important consequences. For example, a method like the
legacy `AnnexRepo.save()` cannot exist, because there is no
`GitRepo.save` that it can override and "do the right thing" for a
git-annex repository.

This is acceptable here, because the API provided in this module
is not targeting the level of compound/convenience operations like
`save()` that aim to alleviate a developer's required expertise.
Instead, the aim here is to provide primitives that can be used in a
higher-level (possibly function-based) API.
  • Loading branch information
mih committed Oct 15, 2024
1 parent a61719a commit 25b04d9
Show file tree
Hide file tree
Showing 4 changed files with 334 additions and 1 deletion.
118 changes: 118 additions & 0 deletions datalad_core/repo/annex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from __future__ import annotations

from pathlib import Path

from datalad_core.runners import (
CommandError,
call_annex_json_lines,
call_git_lines,
)


class AnnexBase:
    """Common base for annex handlers

    An instance keeps the directory in which git-annex commands are
    executed, and offers queries on the annex reachable from there.
    """

    def __init__(self, path: Path):
        """
        ``path`` is stored as-is and is later passed as ``cwd`` to
        git-annex calls. No validation is performed here.
        """
        # populated lazily by the ``uuid`` property
        self._uuid = None
        self._path = path

    @property
    def path(self) -> Path:
        """Path used by git-annex commands as PWD"""
        return self._path

    # a read-only property backed by a private attribute is used instead of
    # a cached_property, because the latter would not prevent write-access
    @property
    def uuid(self) -> str:
        """UUID of the annex that ``list_annexes()`` flags as ``here``

        The value is determined on first access and kept for subsequent
        accesses.

        Raises
        ------
        RuntimeError
          If no reported annex record has ``here`` set to ``True``.
        """
        if self._uuid is not None:
            return self._uuid
        for record in self.list_annexes():
            if record['here'] is True:
                self._uuid = record['uuid']
                return self._uuid
        msg = 'annex unexpectedly has no UUID'  # pragma: no cover
        raise RuntimeError(msg)  # pragma: no cover

    def list_annexes(self) -> list[dict[str, str | bool]]:
        """Return the annex records reported for ``info --fast``

        The records listed under the untrusted, semitrusted, and trusted
        repository keys of the report are collected (in that order) into
        a single list. Each record is amended with a ``trust`` key holding
        the first word of the key it was found under.

        Raises
        ------
        RuntimeError
          If the query does not yield exactly one report item.
        """
        reports = list(call_annex_json_lines(['info', '--fast'], cwd=self.path))
        if len(reports) != 1:  # pragma: no cover
            msg = 'unexpected output from git-annex-info'
            raise RuntimeError(msg)
        (info,) = reports
        collected: list[dict[str, str | bool]] = []
        for group in (
            'untrusted repositories',
            'semitrusted repositories',
            'trusted repositories',
        ):
            # TODO: make ENUM?
            trust_label = group.split(' ')[0]
            for record in info[group]:
                record.update(trust=trust_label)
                collected.append(record)
        return collected


class BareRepoAnnex(AnnexBase):
    """Annex handler that can only be created for a bare repository"""

    # ATTN: This class should not get additional methods. Instead, they should
    # all go into AnnexBase, if they can work with bare and non-bare
    # repositories. The purpose of this class is solely to enforce use with
    # a bare repository in its __init__()

    def __init__(self, path: Path):
        """
        The given ``path`` must point to a bare Git repository and is used
        to resolve and confirm the presence of an annex.

        Raises
        ------
        ValueError
          If Git does not report a bare repository for ``path``, or if the
          resolved annex location does not exist.
        """
        # NOTE(review): unlike in ``Annex``, a failure of the Git call is
        # not translated into a ValueError here
        rev_parse_cmd = [
            '-C',
            str(path),
            'rev-parse',
            '--path-format=absolute',
            '--is-bare-repository',
            '--git-path',
            'annex',
        ]
        # one output line per query, in the order given on the command line
        is_bare, annex_loc = call_git_lines(rev_parse_cmd)
        if is_bare != 'true':
            msg = f'not a bare repository at {path}'
            raise ValueError(msg)
        # this simple test is also what is done in legacy AnnexRepo
        annex_dir = Path(annex_loc)
        if not annex_dir.exists():
            msg = f'no repository annex found at {annex_dir}'
            raise ValueError(msg)
        # git-annex commands run in the directory containing the annex
        super().__init__(annex_dir.parent)


class Annex(AnnexBase):
    """Annex handler for a repository worktree"""

    def __init__(self, path: Path):
        """
        The given ``path`` must point to a Git repository worktree
        and is used to resolve and confirm the presence of an annex.

        Raises
        ------
        ValueError
          If Git cannot resolve the annex and worktree locations for
          ``path``, or if the resolved annex location does not exist.
        """
        rev_parse_cmd = [
            '-C',
            str(path),
            'rev-parse',
            '--path-format=absolute',
            '--git-path',
            'annex',
            '--show-toplevel',
        ]
        try:
            # one output line per query, in the order given on the command line
            annex_loc, worktree_loc = call_git_lines(rev_parse_cmd)
        except CommandError as e:
            msg = f'cannot resolve paths for a worktree with an annex at {path}'
            raise ValueError(msg) from e
        # this simple test is also what is done in legacy AnnexRepo
        annex_dir = Path(annex_loc)
        if not annex_dir.exists():
            msg = f'no repository annex found at {annex_dir}'
            raise ValueError(msg)
        # git-annex commands run in the worktree root, not where the annex is
        super().__init__(Path(worktree_loc))
71 changes: 70 additions & 1 deletion datalad_core/repo/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@
LocalGitConfig,
get_manager,
)
from datalad_core.repo.annex import BareRepoAnnex
from datalad_core.repo.gitmanaged import GitManaged
from datalad_core.runners import call_git
from datalad_core.runners import (
call_annex_json_lines,
call_git,
)


class Repo(GitManaged):
Expand All @@ -45,6 +49,7 @@ def __init__(self, path: Path):
def reset(self) -> None:
    """Reset the instance's cached handlers

    After the parent class' ``reset()`` ran, the cached annex handler
    and configuration manager are set to ``None``.
    """
    super().reset()
    self._annex: BareRepoAnnex | None = None
    self._config: ConfigManager | None = None

@property
def config(self) -> ConfigManager:
Expand Down Expand Up @@ -87,6 +92,52 @@ def config(self) -> ConfigManager:
self._config = lman
return self._config

def init_annex(
    self,
    description: str | None = None,
    *,
    autoenable_remotes: bool = True,
) -> BareRepoAnnex:
    """Initialize an annex in a bare repository

    Parameters
    ----------
    description:
      Optional annex description, passed on to the annex initialization.
    autoenable_remotes:
      If ``False``, ``--no-autoenable`` is passed to the annex
      initialization.

    Returns
    -------
    The value of :attr:`bare_annex` after initialization.

    Raises
    ------
    TypeError
      If the ``core.bare`` configuration value is ``False``.
    """
    bare_setting = self.config.get('core.bare', False).value
    if bare_setting is False:
        msg = (
            'Cannot initialize annex in a non-bare repository, '
            'use Worktree.init_annex()'
        )
        raise TypeError(msg)
    # for a bare repository git-annex is executed in the repository itself
    self._init_annex(
        self.path,
        description=description,
        autoenable_remotes=autoenable_remotes,
    )
    return self.bare_annex

# we name this "bare_annex" not just "annex", even though it is clunky,
# to avoid the confusions associated with "but it has an annex, it is
# just not a bare repository"
@property
def bare_annex(self) -> BareRepoAnnex | None:
    """Handler for a bare repository's annex

    If there is no initialized annex, or the repository is not bare,
    this will be ``None``.

    To get a handler for a non-bare repository's annex use
    :attr:`Worktree.annex`.
    """
    if self.config.get('core.bare', False).value is False:
        return None
    if self._annex is not None:
        # a handler was created on a previous access
        return self._annex
    try:
        self._annex = BareRepoAnnex(self.path)
    except ValueError:
        # keeping it at None means that we will keep trying to
        # locate an annex each time. I believe this is a sensible
        # behavior. A once-present annex is unlikely to go away,
        # but an annex could be initialized at any time
        self._annex = None
    return self._annex

@classmethod
def init_at(cls, path: Path) -> Repo:
"""Initialize a bare repository in an existing directory
Expand All @@ -102,3 +153,21 @@ def init_at(cls, path: Path) -> Repo:
capture_output=True,
)
return cls(path)

def _init_annex(
    self,
    exec_path: Path,
    *,
    description: str | None = None,
    autoenable_remotes: bool = True,
) -> None:
    """Run the annex initialization in ``exec_path``

    Parameters
    ----------
    exec_path:
      Directory passed as ``cwd`` to the git-annex call.
    description:
      If not ``None``, appended as the last command argument.
    autoenable_remotes:
      If ``False``, ``--no-autoenable`` is added to the command.
    """
    init_args = ['init']
    if not autoenable_remotes:
        # no, we do not set --autoenable, this is a RepoAnnex feature
        init_args += ['--no-autoenable']
    if description is not None:
        init_args += [description]
    # exhaust the result iterable, we only expect a single item and
    # have no use for it
    # TODO: consume()?
    for _ in call_annex_json_lines(init_args, cwd=exec_path):
        pass
116 changes: 116 additions & 0 deletions datalad_core/repo/tests/test_annex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import pytest

from datalad_core.runners import (
CommandError,
)

from ..annex import (
Annex,
BareRepoAnnex,
)
from ..repo import Repo
from ..worktree import Worktree


def test_barerepoannex_error(baregitrepo):
    """``BareRepoAnnex`` rejects the ``baregitrepo`` fixture: no annex found"""
    expected_msg = 'no repository annex found'
    with pytest.raises(ValueError, match=expected_msg):
        BareRepoAnnex(baregitrepo)


def test_barerepoannex_nonbare(annexrepo):
    """Bare-repository annex interfaces refuse the ``annexrepo`` fixture"""
    # direct instantiation of the handler fails
    with pytest.raises(ValueError, match='not a bare repository'):
        BareRepoAnnex(annexrepo)

    # annex initialization via the worktree's repository fails too
    worktree = Worktree(annexrepo)
    with pytest.raises(TypeError, match='Cannot initialize annex in a non-bare repo'):
        worktree.repo.init_annex()

    # and no bare-annex handler is reported
    assert worktree.repo.bare_annex is None


# # check annex location resolution. it should be fairly robust and
# # pretty much always find the annex, as long as the path points
# # anywhere inside a git repo
# annex0 = BareRepoAnnex(annexrepo)
# annex1 = BareRepoAnnex(annexrepo / '.git')
# annex2 = BareRepoAnnex(annexrepo / '.git' / 'annex')
# testdir = annexrepo / 'somedir'
# testdir.mkdir()
# annex3 = BareRepoAnnex(testdir)
# assert annex0.path == annex1.path == annex2.path == annex3.path


def test_annex_error(annexrepo):
    """``Annex`` raises ``ValueError`` when pointed at a worktree's git dir"""
    expected_msg = 'cannot resolve path'
    with pytest.raises(ValueError, match=expected_msg):
        Annex(Worktree(annexrepo).git_dir)


def test_annex_noannex(gitrepo):
    """Without an annex, ``Annex`` raises and ``Worktree.annex`` is ``None``"""
    expected_msg = 'no repository annex found'
    with pytest.raises(ValueError, match=expected_msg):
        Annex(gitrepo)

    worktree = Worktree(gitrepo)
    assert worktree.annex is None
    # a second access must give the same answer
    assert worktree.annex is None


def test_annex(annexrepo):
    """``Annex`` resolves to the worktree root and reports a UUID"""
    from_root = Annex(annexrepo)
    # initialization is robust to "anywhere in repo"
    subdir = annexrepo / 'somedir'
    subdir.mkdir()
    from_subdir = Annex(subdir)
    assert from_root.path == from_subdir.path
    assert from_subdir.path == annexrepo
    assert from_root.uuid


def test_repo_init_annex_error(baregitrepo):
    """Annex initialization fails when a file occupies the annex location"""
    bare_repo = Repo(baregitrepo)
    # we take the place of the annex
    blocker = bare_repo.path / 'annex'
    blocker.touch()
    with pytest.raises(CommandError, match='fileExist'):
        bare_repo.init_annex()


def test_worktree_init_annex(gitrepo):
    """``Worktree.init_annex()`` returns the handler also exposed as ``annex``"""
    worktree = Worktree(gitrepo)
    created = worktree.init_annex()
    assert worktree.annex is created
    assert worktree.annex.uuid


def test_repo_init_annex(baregitrepo):
    """``Repo.init_annex()`` makes ``bare_annex`` available, with a UUID"""
    bare_repo = Repo(baregitrepo)
    assert bare_repo.bare_annex is None

    # setting the flag has no effect here, it just exercises the
    # code path
    created = bare_repo.init_annex('testannex', autoenable_remotes=False)
    assert created is bare_repo.bare_annex
    assert bare_repo.bare_annex.uuid
    # ask again to exercise cached code path
    assert bare_repo.bare_annex.uuid


def test_relocate_repo_w_annex(tmp_path):
    """Annex stays usable after re-initializing a worktree with a new gitdir"""
    new_repo_dir = tmp_path / 'new_repo'
    orig_repo_dir = tmp_path / 'orig_repo'
    wt_dir = tmp_path / 'wt'
    wt_dir.mkdir()

    orig_wt = Worktree.init_at(wt_dir, gitdir=orig_repo_dir)
    orig_wt.init_annex()
    assert orig_wt.repo.path == orig_repo_dir
    # annex commands run in the context of the worktree, not the repo
    assert orig_wt.annex.path == orig_wt.path
    known_annexes = orig_wt.annex.list_annexes()
    assert len(known_annexes) > 1

    moved_wt = Worktree.init_at(wt_dir, gitdir=new_repo_dir)
    assert moved_wt.repo.path == new_repo_dir
    assert (new_repo_dir / 'annex').is_dir()
    assert moved_wt.annex.path == orig_wt.path
    # running annex commands continues to work after relocation
    assert known_annexes == moved_wt.annex.list_annexes()
30 changes: 30 additions & 0 deletions datalad_core/repo/worktree.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ConfigManager,
WorktreeGitConfig,
)
from datalad_core.repo.annex import Annex
from datalad_core.repo.gitmanaged import GitManaged
from datalad_core.repo.repo import Repo
from datalad_core.runners import call_git
Expand All @@ -38,6 +39,7 @@ def __init__(

def reset(self) -> None:
    """Reset the instance's cached handlers

    After the parent class' ``reset()`` ran, the cached repository,
    configuration manager, and annex handler are set to ``None``.
    """
    super().reset()
    self._repo: Repo | None = None
    self._config: ConfigManager | None = None
    self._annex: Annex | None = None

Expand Down Expand Up @@ -125,6 +127,34 @@ def repo(self) -> Repo:
self._repo = Repo(self.git_common_dir)
return self._repo

def init_annex(
    self,
    description: str | None = None,
    *,
    autoenable_remotes: bool = True,
) -> Annex:
    """Initialize an annex for the worktree's repository

    Parameters
    ----------
    description:
      Optional annex description, passed on to the annex initialization.
    autoenable_remotes:
      If ``False``, ``--no-autoenable`` is passed to the annex
      initialization.

    Returns
    -------
    The value of :attr:`annex` after initialization.
    """
    # NOTE(review): no bare/non-bare check is performed here; the
    # initialization is delegated to the repository's helper, but
    # executed in the worktree directory
    repo = self.repo
    repo._init_annex(  # noqa: SLF001
        self.path,
        description=description,
        autoenable_remotes=autoenable_remotes,
    )
    return self.annex

@property
def annex(self) -> Annex | None:
    """Handler for the worktree's annex

    If no annex can be located from the worktree path (``Annex`` raises
    ``ValueError``), this will be ``None``. A successfully created
    handler is kept and returned on subsequent accesses.
    """
    if self._annex is None:
        try:
            self._annex = Annex(self.path)
        except ValueError:
            # resetting it to None means that we will keep trying to
            # locate an annex each time. I believe this is a sensible
            # behavior. A once-present annex is unlikely to go away,
            # but an annex could be initialized at any time
            self._annex = None
    return self._annex

@classmethod
def init_at(cls, path: Path, gitdir: Path | None = None) -> Worktree:
"""Initialize a worktree for a new/existing repository in a directory
Expand Down

0 comments on commit 25b04d9

Please sign in to comment.