From bececdac927160b5c7e883736d7cc79d5699ad0a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:33:30 +0200 Subject: [PATCH] Fix args of feature docstrings (#7103) * Fix args of feature docstrings * Align dtype arg description in docstrings * Align shape arg description in docstrings * Rename FeatureConnector to Feature * Add feature subsections in docs * Add LargeList to Features docstring and rephrase * Rephrase --- .../source/package_reference/main_classes.mdx | 12 ++++ src/datasets/features/features.py | 59 ++++++++++--------- src/datasets/features/translation.py | 4 +- 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx index 86257f624b4..f9700fd19be 100644 --- a/docs/source/package_reference/main_classes.mdx +++ b/docs/source/package_reference/main_classes.mdx @@ -211,18 +211,26 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable [[autodoc]] datasets.Features +### Scalar + [[autodoc]] datasets.Value [[autodoc]] datasets.ClassLabel +### Composite + [[autodoc]] datasets.LargeList [[autodoc]] datasets.Sequence +### Translation + [[autodoc]] datasets.Translation [[autodoc]] datasets.TranslationVariableLanguages +### Arrays + [[autodoc]] datasets.Array2D [[autodoc]] datasets.Array3D @@ -231,8 +239,12 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable [[autodoc]] datasets.Array5D +### Audio + [[autodoc]] datasets.Audio +### Image + [[autodoc]] datasets.Image ## Filesystems diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index dc7c0f8c850..4f0a75c4753 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -460,8 +460,9 @@ def cast_to_python_objects(obj: Any, only_1d_for_numpy=False, optimize_list_cast @dataclass class Value: """ - The `Value` dtypes are as follows: + Scalar feature value of a particular data type. + The possible dtypes of `Value` are as follows: - `null` - `bool` - `int8` @@ -489,6 +490,10 @@ class Value: - `string` - `large_string` + Args: + dtype (`str`): + Name of the data type. + Example: ```py @@ -546,9 +551,9 @@ class Array2D(_ArrayXD): Args: shape (`tuple`): - The size of each dimension. + Size of each dimension. dtype (`str`): - The value of the data type. + Name of the data type. Example: @@ -571,9 +576,9 @@ class Array3D(_ArrayXD): Args: shape (`tuple`): - The size of each dimension. + Size of each dimension. dtype (`str`): - The value of the data type. + Name of the data type. Example: @@ -596,9 +601,9 @@ class Array4D(_ArrayXD): Args: shape (`tuple`): - The size of each dimension. + Size of each dimension. dtype (`str`): - The value of the data type. + Name of the data type. Example: @@ -621,9 +626,9 @@ class Array5D(_ArrayXD): Args: shape (`tuple`): - The size of each dimension. + Size of each dimension. dtype (`str`): - The value of the data type. + Name of the data type. Example: @@ -1139,7 +1144,7 @@ class Sequence: Mostly here for compatiblity with tfds. Args: - feature: + feature ([`FeatureType`]): A list of features of a single type or a dictionary of types. length (`int`): Length of the sequence. @@ -1170,7 +1175,7 @@ class LargeList: It is backed by `pyarrow.LargeListType`, which is like `pyarrow.ListType` but with 64-bit rather than 32-bit offsets. Args: - dtype: + dtype ([`FeatureType`]): Child feature data type of each item within the large list. """ @@ -1695,30 +1700,30 @@ class Features(dict): and values are the type of that column. `FieldType` can be one of the following: - - a [`~datasets.Value`] feature specifies a single typed value, e.g. `int64` or `string`. - - a [`~datasets.ClassLabel`] feature specifies a field with a predefined set of classes which can have labels - associated to them and will be stored as integers in the dataset. - - a python `dict` which specifies that the field is a nested field containing a mapping of sub-fields to sub-fields - features. It's possible to have nested fields of nested fields in an arbitrary manner. - - a python `list` or a [`~datasets.Sequence`] specifies that the field contains a list of objects. The python - `list` or [`~datasets.Sequence`] should be provided with a single sub-feature as an example of the feature - type hosted in this list. + - [`Value`] feature specifies a single data type value, e.g. `int64` or `string`. + - [`ClassLabel`] feature specifies a predefined set of classes which can have labels associated to them and + will be stored as integers in the dataset. + - Python `dict` specifies a composite feature containing a mapping of sub-fields to sub-features. + It's possible to have nested fields of nested fields in an arbitrary manner. + - Python `list`, [`LargeList`] or [`Sequence`] specifies a composite feature containing a sequence of + sub-features, all of the same feature type. - A [`~datasets.Sequence`] with a internal dictionary feature will be automatically converted into a dictionary of + A [`Sequence`] with an internal dictionary feature will be automatically converted into a dictionary of lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be - un-wanted in some cases. If you don't want this behavior, you can use a python `list` instead of the - [`~datasets.Sequence`]. + un-wanted in some cases. If you don't want this behavior, you can use a Python `list` or a [`LargeList`] + instead of the [`Sequence`]. - - a [`Array2D`], [`Array3D`], [`Array4D`] or [`Array5D`] feature for multidimensional arrays. - - an [`Audio`] feature to store the absolute path to an audio file or a dictionary with the relative path + - [`Array2D`], [`Array3D`], [`Array4D`] or [`Array5D`] feature for multidimensional arrays. + - [`Audio`] feature to store the absolute path to an audio file or a dictionary with the relative path to an audio file ("path" key) and its bytes content ("bytes" key). This feature extracts the audio data. - - an [`Image`] feature to store the absolute path to an image file, an `np.ndarray` object, a `PIL.Image.Image` object - or a dictionary with the relative path to an image file ("path" key) and its bytes content ("bytes" key). This feature extracts the image data. - - [`~datasets.Translation`] and [`~datasets.TranslationVariableLanguages`], the two features specific to Machine Translation. + - [`Image`] feature to store the absolute path to an image file, an `np.ndarray` object, a `PIL.Image.Image` object + or a dictionary with the relative path to an image file ("path" key) and its bytes content ("bytes" key). + This feature extracts the image data. + - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation. """ def __init__(*args, **kwargs): diff --git a/src/datasets/features/translation.py b/src/datasets/features/translation.py index 1d3eb1af4bb..584bf3186c3 100644 --- a/src/datasets/features/translation.py +++ b/src/datasets/features/translation.py @@ -10,7 +10,7 @@ @dataclass class Translation: - """`FeatureConnector` for translations with fixed languages per example. + """`Feature` for translations with fixed languages per example. Here for compatiblity with tfds. Args: @@ -50,7 +50,7 @@ def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]: @dataclass class TranslationVariableLanguages: - """`FeatureConnector` for translations with variable languages per example. + """`Feature` for translations with variable languages per example. Here for compatiblity with tfds. Args: