diff --git a/lib/ncdata/utils/__init__.py b/lib/ncdata/utils/__init__.py index 1dd4138..b75d1a4 100644 --- a/lib/ncdata/utils/__init__.py +++ b/lib/ncdata/utils/__init__.py @@ -1,5 +1,7 @@ """General user utility functions.""" +from ._copy import ncdata_copy +from ._dim_indexing import Slicer, index_by_dimensions from ._save_errors import save_errors -__all__ = ["save_errors"] +__all__ = ["Slicer", "index_by_dimensions", "ncdata_copy", "save_errors"] diff --git a/lib/ncdata/utils/_copy.py b/lib/ncdata/utils/_copy.py new file mode 100644 index 0000000..d196d6b --- /dev/null +++ b/lib/ncdata/utils/_copy.py @@ -0,0 +1,52 @@ +"""Utility to copy NcData objects, but not copying any contained data arrays.""" + +from ncdata import NameMap, NcAttribute, NcData, NcDimension, NcVariable + + +def _attributes_copy(attrs: NameMap) -> NameMap: + return NameMap.from_items( + [ + NcAttribute(name=attr.name, value=attr.value) + for attr in attrs.values() + ], + item_type=NcAttribute, + ) + + +def ncdata_copy(ncdata: NcData) -> NcData: + """ + Return a copy of the data. + + The operation makes fresh copies of all ncdata objects, but does not copy arrays in + either variable data attribute values. + + Parameters + ---------- + ncdata + data to copy + + Returns + ------- + ncdata + identical but distinct copy of input + + """ + return NcData( + attributes=_attributes_copy(ncdata.attributes), + dimensions=[ + NcDimension(dim.name, size=dim.size, unlimited=dim.unlimited) + for dim in ncdata.dimensions.values() + ], + variables=[ + NcVariable( + name=var.name, + dimensions=var.dimensions, + dtype=var.dtype, + data=var.data, + attributes=_attributes_copy(var.attributes), + group=var.group, + ) + for var in ncdata.variables.values() + ], + groups=[ncdata_copy(group) for group in ncdata.groups.values()], + ) diff --git a/lib/ncdata/utils/_dim_indexing.py b/lib/ncdata/utils/_dim_indexing.py new file mode 100644 index 0000000..d6947df --- /dev/null +++ b/lib/ncdata/utils/_dim_indexing.py @@ -0,0 +1,161 @@ +from numbers import Number +from typing import Any, List, Mapping, Union + +import dask.array as da + +from ncdata import NcData +from ncdata.utils import ncdata_copy + + +def index_by_dimensions( + ncdata: NcData, **dim_index_kwargs: Mapping[str, Any] +) -> NcData: + """ + Index an NcData over dimensions. + + Parameters + ---------- + ncdata + input data + dim_index_kwargs + specify indexing to apply to dimensions. + E.G. ``x=1``, ``time=slice(0, 100)``, ``levels=[1,2,5]``. + + Returns + ------- + copy of input with dimensions, and all relevant variables, sub-indexed. + + Notes + ----- + Where a dimension key is a single value, the dimension will be *removed*. + This mimics how numpy arrays behave, i.e. the difference between a[1] and a[1:2] + + Examples + -------- + ncdata = index_by_dimensions(ncdata, time=slice(0, 10)) # equivalent to [:10] + ncdata = index_by_dimensions(ncdata, levels=[1,2,5]) + ncdata = index_by_dimensions(ncdata, time=3, levels=slice(2, 10, 3)) + + See Also + -------- + :class:`Slicer` provides the same function with a slicing syntax + """ + # Start by copying the input : then modify that in-place + ncdata = ncdata_copy(ncdata) + for dim_name, key in dim_index_kwargs: + # Dimension names must occur in the ncdata. + dimension = ncdata.dimensions.get(dim_name) + if dimension is None: + raise ValueError( + f"Dimension {dim_name!r} is not present in 'ncdata'." + ) + + # Check for and fail repeated dimensions: the meaning would be unclear (!) + matches = [name for name in dim_index_kwargs if name == dim_name] + if len(matches) > 1: + msg = ( + f"Dimensions to index, {tuple(dim_index_kwargs.keys())}, " + f"includes dimension {dim_name!r} more than once." + ) + raise ValueError(msg) + + # Hopefully this replicates how numpy makes this decision? + remove_dim = isinstance(key, Number) + + # TODO: + # Key types must be supported: + # * int (or other numeric, including numpy scalars ?) + # * list of int + # * slice object + # * 1-D array of numeric + # Key "special" types we could possibly error or convert, to avoid confusion + # with numpy behaviours ? : + # arrays, tuples, booleans, None, newaxis, ellipsis ... + + # Index the data of all referencing variables + for var in ncdata.variables.values(): + if dim_name in var.dimensions: + # construct a list of slice objects + (i_slicedim,) = [ + i + for i, name in enumerate(var.dimensions) + if name == dim_name + ] + slices = [slice(None) for dim in var.dimensions] + slices[i_slicedim] = key + + # index the data + var.data = var.data[tuple(slices)] + + # also remove the dim, if it will be removed + if remove_dim: + del var.dimensions[dim_name] + + # Remove or reduce the dimension itself. + if remove_dim: + del ncdata.dimensions[dim_name] + else: + # calculate the new dim size, using numpy-like logic + # TODO: there is probably a better way of calculating this ? + new_size = da.zeros(dimension.size)[key].shape[0] + dimension.size = new_size + + return ncdata + + +class Slicer: + """ + An object which can index an NcData over its dimensions. + + This wraps the :meth:`index_by_dimensions` method for convenience, returning an + object which supports the Python extended slicing syntax. + + Examples + -------- + data = Slicer(ncdata, 'time')[:10] + data = Slicer(ncdata, 'level')[[1, 2, 5]] + data = Slicer(ncdata, 'level', 'time', 'x', 'y')[1, :3, 2:10:3, ::-1] + """ + + def __init__(self, ncdata: NcData, dimensions: Union[str, List[str]]): + """ + Create an indexer for an NcData, applying to specific dimensions. + + This can then be indexed to produce a derived (sub-indexed) dataset. + + Parameters + ---------- + ncdata + input data + dimensions + one or more dimension names, to which successive index keys will be applied + """ + self.ncdata = ncdata + if isinstance(dimensions, str): + dimensions = [dimensions] + self.dim_names = tuple(dimensions) + + def __getitem__(self, keys) -> NcData: + """ + Return an indexed portion of self.ncdata. + + Index with 'keys' in the specified dimensions. + """ + if not isinstance(keys, tuple): + # Single key, e.g. 1, slice(None), [2,3,4], array([2,3]) + # N.B. *otherwise* keys is always a tuple + # A single tuple argument is passed as-is, i.e. interprets as multiple keys + keys = [keys] + + n_keys = len(keys) + if len(keys) > self.dim_names: + msg = ( + f"Too many index keys, {n_keys}, for the specified indexing dimension " + "names, {self.dim_names!r}." + ) + raise ValueError(msg) + + # NB too *few* keys is not a problem, since 'zip' truncates for us. + dim_kwargs = {name: key for name, key in zip(self.dim_names, keys)} + + return index_by_dimensions(self.ncdata, **dim_kwargs)