Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial annex support #24

Merged
merged 2 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,19 @@

# Names exported by this conftest module. The visible imports below pull
# them in from `datalad_core.tests.fixtures`; presumably all of them are
# pytest fixtures -- TODO confirm for the names whose import is not shown here.
__all__ = [
'cfgman',
'bareannexrepo',
'baregitrepo',
'annexrepo',
'gitrepo',
'verify_pristine_gitconfig_global',
]


from datalad_core.tests.fixtures import (
# function-scope temporary Git repo with an initialized annex
annexrepo,
# function-scope temporary, bare Git repo with an initialized annex
bareannexrepo,
# function-scope temporary, bare Git repo
baregitrepo,
# function-scope config manager
Expand Down
30 changes: 27 additions & 3 deletions datalad_core/repo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,45 @@
"""Repository and worktree representations

The classes in this module implement the "flyweight" pattern. This means that,
within the same process, creating instances of :class:`Repo` and
:class:`Worktree` always yields the same instance for the same path location.
This module provides the essential types for working with Git/git-annex
repositories and DataLad datasets.

For most use cases, the :class:`Worktree` class is the main entrypoint. It can
be pointed to an existing Git repository checkout, or a new repository can be
created via its :meth:`Worktree.init_at` class method. Access to operations on
the underlying Git repository (that may be shared by additional worktrees) is
possible via the :attr:`Worktree.repo` attribute. An optional (git-)annex can
be initialized (:meth:`Worktree.init_annex`), and accessed via
:attr:`Worktree.annex`.

Working with bare Git repositories is supported by using the :class:`Repo`
class directly. Operations on an annex of a bare repository are accessible
via the :attr:`Repo.bare_annex` attribute.

The :class:`Repo` and :class:`Worktree` classes in this module implement the
"flyweight" pattern. This means that, within the same process, creating
instances of :class:`Repo` and :class:`Worktree` always yields the same
instance for the same path location.

.. currentmodule:: datalad_core.repo
.. autosummary::
:toctree: generated

Repo
Worktree
Annex
BareRepoAnnex
"""

# Public API of the package: each name listed here is imported from one of
# the submodules (`.annex`, `.repo`, `.worktree`) right below.
__all__ = [
'Repo',
'Worktree',
'Annex',
'BareRepoAnnex',
]

from .annex import (
Annex,
BareRepoAnnex,
)
from .repo import Repo
from .worktree import Worktree
149 changes: 149 additions & 0 deletions datalad_core/repo/annex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
from __future__ import annotations

from pathlib import Path

from datalad_core.runners import (
CommandError,
call_annex_json_lines,
call_git_lines,
)


class AnnexBase:
    """Common functionality of annex handlers

    This class is not meant to be used directly. Use one of its subclasses
    instead.

    The methods implemented here perform operations that work with annexes
    in both bare and non-bare repositories.
    """

    def __init__(self, path: Path):
        # the UUID is looked up lazily, on first access of `uuid`
        self._uuid: str | None = None
        self._path = path

    @property
    def path(self) -> Path:
        """Path used by git-annex commands as PWD"""
        return self._path

    # a plain property rather than a cached_property, because the latter
    # would not prevent write-access
    @property
    def uuid(self) -> str:
        """UUID identifier of the local ("here") annex

        The value is determined on first access, and is cached for the
        lifetime of the instance afterwards.

        A ``RuntimeError`` is raised when none of the known annexes is
        flagged as being the local one.
        """
        if self._uuid is not None:
            return self._uuid

        # first annex record that is flagged to be the local one
        local = next(
            (rec for rec in self.list_annexes() if rec['here'] is True),
            None,
        )
        if local is None:  # pragma: no cover
            msg = 'annex unexpectedly has no UUID'
            raise RuntimeError(msg)
        self._uuid = str(local['uuid'])
        return self._uuid

    def list_annexes(self) -> list[dict[str, str | bool]]:
        """Returns a list with information on known annexes for a repository

        The information is taken from a ``git annex info --fast`` call that
        is executed in :attr:`path`.

        Each item in the list corresponds to an annex and is a dictionary
        with annex properties. Dictionary ``str`` keys are

        - ``uuid``: annex UUID identifier
        - ``description``: a custom or generated description of that annex
        - ``here``: a boolean flag whether that annex is local to this
          repository
        - ``trust``: a label (``trusted``, ``semitrusted``, ``untrusted``)
          indicating the trust assignment for that annex

        A ``RuntimeError`` is raised when the command does not yield
        exactly one result record.
        """
        records = list(call_annex_json_lines(['info', '--fast'], cwd=self.path))
        if len(records) != 1:  # pragma: no cover
            msg = 'unexpected output from git-annex-info'
            raise RuntimeError(msg)
        (info,) = records

        # annexes are reported in one group per trust level
        trust_groups = (
            'untrusted repositories',
            'semitrusted repositories',
            'trusted repositories',
        )
        found: list[dict[str, str | bool]] = []
        for group in trust_groups:
            # TODO: make ENUM?
            # the trust label is the first word of the group name
            label = group.partition(' ')[0]
            for rec in info[group]:
                rec.update(trust=label)
                found.append(rec)
        return found


class BareRepoAnnex(AnnexBase):
    # ATTN: Keep this class small. Any method that can work with bare and
    # non-bare repositories alike belongs into AnnexBase. This class mainly
    # exists to enforce, in its __init__(), that it is used with a bare
    # repository.
    """Handler for the annex of a bare Git repository

    Annex operations are executed in the context of a Git repository,
    which must be a bare repository. Using this class with a non-bare
    repository raises an exception.
    """

    def __init__(self, path: Path):
        """
        ``path`` must point to a bare Git repository. It is used to
        resolve the location of the annex, and to confirm its presence.

        A ``ValueError`` is raised when the repository is not bare, or
        when the resolved annex location does not exist.
        """
        # a single `rev-parse` call reports the bare-flag and the (absolute)
        # annex location, one per output line
        rev_parse_cmd = [
            '-C',
            str(path),
            'rev-parse',
            '--path-format=absolute',
            '--is-bare-repository',
            '--git-path',
            'annex',
        ]
        is_bare, annex_loc = call_git_lines(rev_parse_cmd)
        if is_bare != 'true':
            msg = f'not a bare repository at {path}'
            raise ValueError(msg)

        annex_path = Path(annex_loc)
        # the legacy AnnexRepo performs the same simple existence test
        if not annex_path.exists():
            msg = f'no repository annex found at {annex_path}'
            raise ValueError(msg)

        # annex commands run in the directory that contains the annex
        super().__init__(annex_path.parent)


class Annex(AnnexBase):
    """Handler for the annex of a non-bare Git repository

    Annex operations are executed in the context of a Git worktree.
    """

    def __init__(self, path: Path):
        """
        ``path`` must point to a Git repository worktree. It is used to
        resolve the location of the annex, and to confirm its presence.

        A ``ValueError`` is raised when the locations cannot be resolved,
        or when the resolved annex location does not exist.
        """
        # a single `rev-parse` call reports the (absolute) annex location
        # and the worktree root, one per output line
        rev_parse_cmd = [
            '-C',
            str(path),
            'rev-parse',
            '--path-format=absolute',
            '--git-path',
            'annex',
            '--show-toplevel',
        ]
        try:
            annex_loc, worktree_loc = call_git_lines(rev_parse_cmd)
        except CommandError as e:
            msg = f'cannot resolve paths for a worktree with an annex at {path}'
            raise ValueError(msg) from e

        annex_path = Path(annex_loc)
        # the legacy AnnexRepo performs the same simple existence test
        if not annex_path.exists():
            msg = f'no repository annex found at {annex_path}'
            raise ValueError(msg)

        # annex commands run in the root directory of the worktree
        super().__init__(Path(worktree_loc))
53 changes: 53 additions & 0 deletions datalad_core/repo/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
LocalGitConfig,
get_manager,
)
from datalad_core.repo.annex import BareRepoAnnex
from datalad_core.repo.gitmanaged import GitManaged
from datalad_core.repo.utils import init_annex_at
from datalad_core.runners import call_git


Expand All @@ -45,6 +47,7 @@ def __init__(self, path: Path):
def reset(self) -> None:
    """Reset the instance, dropping any cached handlers"""
    # give the base class the chance to reset its own state first
    super().reset()
    # no handlers are held after a reset; the `bare_annex` property
    # fills `_annex` again on access
    self._annex: BareRepoAnnex | None = None
    self._config: ConfigManager | None = None

@property
def config(self) -> ConfigManager:
Expand Down Expand Up @@ -87,6 +90,56 @@ def config(self) -> ConfigManager:
self._config = lman
return self._config

def init_annex(
    self,
    description: str | None = None,
    *,
    autoenable_remotes: bool = True,
) -> BareRepoAnnex:
    """Initialize an annex in this bare repository

    ``description`` and ``autoenable_remotes`` are handed to
    ``init_annex_at()`` unchanged, together with the repository path.

    Returns the :class:`BareRepoAnnex` handler for the annex, i.e. the
    same object that :attr:`bare_annex` reports.

    A ``TypeError`` is raised when the repository is not bare according
    to its ``core.bare`` configuration. For a non-bare repository use
    ``Worktree.init_annex()`` instead. A ``RuntimeError`` is raised when
    no annex can be found after the initialization.
    """
    bare_setting = self.config.get('core.bare', False)
    if bare_setting.value is False:
        msg = (
            'Cannot initialize annex in a non-bare repository, '
            'use Worktree.init_annex()'
        )
        raise TypeError(msg)

    init_annex_at(
        self.path,
        autoenable_remotes=autoenable_remotes,
        description=description,
    )

    handler = self.bare_annex
    if handler is None:  # pragma: no cover
        msg = 'could not initialize annex unexpectedly'
        raise RuntimeError(msg)
    return handler

# we name this "bare_annex" not just "annex", even though it is clunky,
# to avoid the confusions associated with "but it has an annex, it is
# just not a bare repository"
@property
def bare_annex(self) -> BareRepoAnnex | None:
    """Handler for a bare repository's annex

    This is ``None`` when the repository is not bare, or when it has
    no initialized annex.

    To get a handler for a non-bare repository's annex use
    :attr:`Worktree.annex`.
    """
    if self.config.get('core.bare', False).value is False:
        return None
    if self._annex is not None:
        # a handler was created on a previous access
        return self._annex
    try:
        handler = BareRepoAnnex(self.path)
    except ValueError:
        # nothing is cached in this case, which means that we will keep
        # trying to locate an annex on each access. I believe this is a
        # sensible behavior. A once-present annex is unlikely to go away,
        # but an annex could be initialized at any time
        return None
    self._annex = handler
    return handler

@classmethod
def init_at(cls, path: Path) -> Repo:
"""Initialize a bare repository in an existing directory
Expand Down
104 changes: 104 additions & 0 deletions datalad_core/repo/tests/test_annex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import pytest

from datalad_core.runners import (
CommandError,
)

from ..annex import (
Annex,
BareRepoAnnex,
)
from ..repo import Repo
from ..worktree import Worktree


def test_barerepoannex_error(baregitrepo):
    """A bare repository without an annex is rejected by BareRepoAnnex"""
    missing_annex = pytest.raises(ValueError, match='no repository annex found')
    with missing_annex:
        BareRepoAnnex(baregitrepo)


def test_barerepoannex_nonbare(annexrepo):
    """Bare-repository annex interfaces refuse to work with a non-bare repo"""
    # direct instantiation of the handler is rejected
    with pytest.raises(ValueError, match='not a bare repository'):
        BareRepoAnnex(annexrepo)

    worktree = Worktree(annexrepo)
    # annex initialization via the underlying repository is rejected too
    with pytest.raises(TypeError, match='Cannot initialize annex in a non-bare repo'):
        worktree.repo.init_annex()

    # and no handler for a bare annex is reported
    assert worktree.repo.bare_annex is None


def test_annex_error(annexrepo):
    """Annex cannot be pointed to a Git directory instead of a worktree"""
    unresolvable = pytest.raises(ValueError, match='cannot resolve path')
    with unresolvable:
        Annex(Worktree(annexrepo).git_dir)


def test_annex_noannex(gitrepo):
    """A worktree without an annex yields no annex handler"""
    with pytest.raises(ValueError, match='no repository annex found'):
        Annex(gitrepo)

    worktree = Worktree(gitrepo)
    # ask twice: it keeps it that way on repeated trials
    for _ in range(2):
        assert worktree.annex is None


def test_annex(annexrepo):
    """Annex resolves the worktree root from any location inside it"""
    from_root = Annex(annexrepo)
    # initialization is robust to "anywhere in repo"
    subdir = annexrepo / 'somedir'
    subdir.mkdir()
    from_subdir = Annex(subdir)
    assert from_root.path == from_subdir.path == annexrepo
    # a UUID can be determined
    assert from_root.uuid


def test_repo_init_annex_error(baregitrepo):
    """Annex initialization fails when the annex location is occupied"""
    repo = Repo(baregitrepo)
    # we take the place of the annex with a plain file
    blocker = repo.path / 'annex'
    blocker.touch()
    with pytest.raises(CommandError, match='fileExist|file already exists'):
        repo.init_annex()


def test_worktree_init_annex(gitrepo):
    """Worktree.init_annex() returns the handler that Worktree.annex reports"""
    worktree = Worktree(gitrepo)
    new_annex = worktree.init_annex()
    assert worktree.annex is new_annex
    # a UUID can be determined for the fresh annex
    assert worktree.annex.uuid


def test_repo_init_annex(baregitrepo):
    """Repo.init_annex() creates an annex in a bare repository"""
    repo = Repo(baregitrepo)
    # nothing there before initialization
    assert repo.bare_annex is None

    # setting the flag has no effect here, it just exercises the
    # code path
    new_annex = repo.init_annex('testannex', autoenable_remotes=False)
    assert new_annex is repo.bare_annex
    # query the UUID twice; the second time exercises the cached code path
    for _ in range(2):
        assert repo.bare_annex.uuid


def test_relocate_repo_w_annex(tmp_path):
    """An annex keeps working after its repository was relocated"""
    wt_dir, orig_repo_dir, new_repo_dir = (
        tmp_path / name for name in ('wt', 'orig_repo', 'new_repo')
    )
    wt_dir.mkdir()

    worktree = Worktree.init_at(wt_dir, gitdir=orig_repo_dir)
    worktree.init_annex()
    assert worktree.repo.path == orig_repo_dir
    # annex commands run in the context of the worktree, not the repo
    assert worktree.annex.path == worktree.path
    annexes_before = worktree.annex.list_annexes()
    assert len(annexes_before) > 1

    # initialize again at the same worktree location, but with a different
    # repository location
    relocated = Worktree.init_at(wt_dir, gitdir=new_repo_dir)
    assert relocated.repo.path == new_repo_dir
    assert (new_repo_dir / 'annex').is_dir()
    assert relocated.annex.path == worktree.path
    # running annex commands continues to work after relocation
    assert annexes_before == relocated.annex.list_annexes()
Loading