diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml
index 4c62ea5c6a5..d8f11e3c10b 100644
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -24,7 +24,7 @@ requirements:
     - dataclasses
     - multiprocess
     - fsspec
-    - huggingface_hub >=0.22.0,<1.0.0
+    - huggingface_hub >=0.23.0,<1.0.0
     - packaging
     - aiohttp
   run:
@@ -41,7 +41,7 @@ requirements:
     - dataclasses
     - multiprocess
     - fsspec
-    - huggingface_hub >=0.22.0,<1.0.0
+    - huggingface_hub >=0.23.0,<1.0.0
     - packaging
     - aiohttp
 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 383c96332c8..2951be28289 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -62,7 +62,7 @@ jobs:
         run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9"
       - name: Install dependencies (minimum versions)
         if: ${{ matrix.deps_versions != 'deps-latest' }}
-        run: uv pip install --system pyarrow==15.0.0 huggingface-hub==0.22.0 transformers dill==0.3.1.1
+        run: uv pip install --system pyarrow==15.0.0 huggingface-hub==0.23.5 transformers dill==0.3.1.1
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
diff --git a/docs/source/image_dataset.mdx b/docs/source/image_dataset.mdx
index 8dd9ac2c1fa..43559cbd2d7 100644
--- a/docs/source/image_dataset.mdx
+++ b/docs/source/image_dataset.mdx
@@ -2,7 +2,7 @@
 
 There are two methods for creating and sharing an image dataset. This guide will show you how to:
 
-* Create an audio dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python.
+* Create an image dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python.
 
 * Create an image dataset with `ImageFolder` and some metadata. This is a no-code solution for quickly creating an image dataset with several thousand images.
 
diff --git a/setup.py b/setup.py
index 10e6f5cda08..3901221c39a 100644
--- a/setup.py
+++ b/setup.py
@@ -133,7 +133,7 @@
     # for data streaming via http
     "aiohttp",
     # To get datasets from the Datasets Hub on huggingface.co
-    "huggingface-hub>=0.22.0",
+    "huggingface-hub>=0.23.0",
     # Utilities from PyPA to e.g., compare versions
    "packaging",
     # To parse YAML metadata from dataset cards
@@ -235,7 +235,7 @@
 
 setup(
     name="datasets",
-    version="3.0.2.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="3.0.3.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="HuggingFace community-driven open-source library of datasets",
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
index 7bbb2cd7666..f72cab6c89f 100644
--- a/src/datasets/__init__.py
+++ b/src/datasets/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "3.0.2.dev0"
+__version__ = "3.0.3.dev0"
 
 from .arrow_dataset import Dataset
 from .arrow_reader import ReadInstruction
diff --git a/src/datasets/download/download_manager.py b/src/datasets/download/download_manager.py
index 6ccb4f9d1c9..a8dff37aeef 100644
--- a/src/datasets/download/download_manager.py
+++ b/src/datasets/download/download_manager.py
@@ -189,7 +189,11 @@ def _download_batched(
             download_func = partial(self._download_single, download_config=download_config)
 
             fs: fsspec.AbstractFileSystem
-            fs, path = url_to_fs(url_or_filenames[0], **download_config.storage_options)
+            path = str(url_or_filenames[0])
+            if is_relative_path(path):
+                # append the relative path to the base_path
+                path = url_or_path_join(self._base_path, path)
+            fs, path = url_to_fs(path, **download_config.storage_options)
             size = 0
             try:
                 size = fs.info(path).get("size", 0)
diff --git a/src/datasets/load.py b/src/datasets/load.py
index 458b917c4f5..0faf2fd5cb5 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -36,10 +36,18 @@
 import requests
 import yaml
 from fsspec.core import url_to_fs
-from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
-from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError, get_session
+from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    GatedRepoError,
+    LocalEntryNotFoundError,
+    OfflineModeIsEnabled,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    get_session,
+)
 
-from . import config
+from . import __version__, config
 from .arrow_dataset import Dataset
 from .builder import BuilderConfig, DatasetBuilder
 from .data_files import (
@@ -73,7 +81,6 @@
 from .splits import Split
 from .utils import _dataset_viewer
 from .utils.file_utils import (
-    OfflineModeIsEnabled,
     _raise_if_offline_mode_is_enabled,
     cached_path,
     get_datasets_user_agent,
@@ -82,7 +89,7 @@
     relative_to_absolute_path,
     url_or_path_join,
 )
-from .utils.hub import check_auth, hf_dataset_url
+from .utils.hub import hf_dataset_url
 from .utils.info_utils import VerificationMode, is_small_dataset
 from .utils.logging import get_logger
 from .utils.metadata import MetadataConfigs
@@ -974,49 +981,48 @@ class HubDatasetModuleFactoryWithoutScript(_DatasetModuleFactory):
     def __init__(
         self,
         name: str,
-        revision: Optional[Union[str, Version]] = None,
+        commit_hash: str,
         data_dir: Optional[str] = None,
         data_files: Optional[Union[str, List, Dict]] = None,
         download_config: Optional[DownloadConfig] = None,
         download_mode: Optional[Union[DownloadMode, str]] = None,
+        use_exported_dataset_infos: bool = False,
     ):
         self.name = name
-        self.revision = revision
+        self.commit_hash = commit_hash
         self.data_files = data_files
         self.data_dir = data_dir
         self.download_config = download_config or DownloadConfig()
         self.download_mode = download_mode
+        self.use_exported_dataset_infos = use_exported_dataset_infos
         increase_load_count(name)
 
     def get_module(self) -> DatasetModule:
-        hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
-            self.name,
-            revision=self.revision,
+        # Get the Dataset Card and fix the revision in case there are new commits in the meantime
+        api = HfApi(
+            endpoint=config.HF_ENDPOINT,
             token=self.download_config.token,
-            timeout=100.0,
+            library_name="datasets",
+            library_version=__version__,
+            user_agent=get_datasets_user_agent(self.download_config.user_agent),
         )
-        # even if metadata_configs is not None (which means that we will resolve files for each config later)
-        # we cannot skip resolving all files because we need to infer module name by files extensions
-        revision = hfh_dataset_info.sha  # fix the revision in case there are new commits in the meantime
-        base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip("/")
-
-        download_config = self.download_config.copy()
-        if download_config.download_desc is None:
-            download_config.download_desc = "Downloading readme"
         try:
-            dataset_readme_path = cached_path(
-                hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
-                download_config=download_config,
+            dataset_readme_path = api.hf_hub_download(
+                repo_id=self.name,
+                filename=config.REPOCARD_FILENAME,
+                repo_type="dataset",
+                revision=self.commit_hash,
+                proxies=self.download_config.proxies,
             )
-            dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
-        except FileNotFoundError:
+            dataset_card_data = DatasetCard.load(dataset_readme_path).data
+        except EntryNotFoundError:
             dataset_card_data = DatasetCardData()
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
             download_config.download_desc = "Downloading standalone yaml"
         try:
             standalone_yaml_path = cached_path(
-                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
                 download_config=download_config,
             )
             with open(standalone_yaml_path, "r", encoding="utf-8") as f:
@@ -1027,17 +1033,13 @@ def get_module(self) -> DatasetModule:
                     dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
         except FileNotFoundError:
             pass
+        base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/")
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
-        # Use the infos from the parquet export except in some cases:
-        if self.data_dir or self.data_files or (self.revision and self.revision != "main"):
-            use_exported_dataset_infos = False
-        else:
-            use_exported_dataset_infos = True
-        if config.USE_PARQUET_EXPORT and use_exported_dataset_infos:
+        if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos:
             try:
                 exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
-                    dataset=self.name, revision=self.revision, token=self.download_config.token
+                    dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
                 )
                 exported_dataset_infos = DatasetInfosDict(
                     {
@@ -1110,7 +1112,7 @@ def get_module(self) -> DatasetModule:
             ]
             default_config_name = None
         builder_kwargs = {
-            "base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
+            "base_path": hf_dataset_url(self.name, "", revision=self.commit_hash).rstrip("/"),
             "repo_id": self.name,
             "dataset_name": camelcase_to_snakecase(Path(self.name).name),
         }
@@ -1122,7 +1124,7 @@ def get_module(self) -> DatasetModule:
         try:
             # this file is deprecated and was created automatically in old versions of push_to_hub
             dataset_infos_path = cached_path(
-                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash),
                 download_config=download_config,
             )
             with open(dataset_infos_path, encoding="utf-8") as f:
@@ -1143,10 +1145,9 @@ def get_module(self) -> DatasetModule:
         if default_config_name is None and len(dataset_infos) == 1:
             default_config_name = next(iter(dataset_infos))
 
-        hash = revision
         return DatasetModule(
             module_path,
-            hash,
+            self.commit_hash,
             builder_kwargs,
             dataset_infos=dataset_infos,
             builder_configs_parameters=BuilderConfigsParameters(
@@ -1165,20 +1166,20 @@ class HubDatasetModuleFactoryWithParquetExport(_DatasetModuleFactory):
     def __init__(
         self,
         name: str,
-        revision: Optional[str] = None,
+        commit_hash: str,
         download_config: Optional[DownloadConfig] = None,
     ):
         self.name = name
-        self.revision = revision
+        self.commit_hash = commit_hash
         self.download_config = download_config or DownloadConfig()
         increase_load_count(name)
 
     def get_module(self) -> DatasetModule:
         exported_parquet_files = _dataset_viewer.get_exported_parquet_files(
-            dataset=self.name, revision=self.revision, token=self.download_config.token
+            dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
         )
         exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
-            dataset=self.name, revision=self.revision, token=self.download_config.token
+            dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
         )
         dataset_infos = DatasetInfosDict(
             {
@@ -1186,15 +1187,26 @@ def get_module(self) -> DatasetModule:
                 for config_name in exported_dataset_infos
             }
         )
-        hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
-            self.name,
-            revision="refs/convert/parquet",
-            token=self.download_config.token,
-            timeout=100.0,
-        )
-        revision = hfh_dataset_info.sha  # fix the revision in case there are new commits in the meantime
+        parquet_commit_hash = (
+            HfApi(
+                endpoint=config.HF_ENDPOINT,
+                token=self.download_config.token,
+                library_name="datasets",
+                library_version=__version__,
+                user_agent=get_datasets_user_agent(self.download_config.user_agent),
+            )
+            .dataset_info(
+                self.name,
+                revision="refs/convert/parquet",
+                token=self.download_config.token,
+                timeout=100.0,
+            )
+            .sha
+        )  # fix the revision in case there are new commits in the meantime
         metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(
-            revision=revision, exported_parquet_files=exported_parquet_files, dataset_infos=dataset_infos
+            parquet_commit_hash=parquet_commit_hash,
+            exported_parquet_files=exported_parquet_files,
+            dataset_infos=dataset_infos,
         )
         module_path, _ = _PACKAGED_DATASETS_MODULES["parquet"]
         builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
@@ -1203,7 +1215,6 @@ def get_module(self) -> DatasetModule:
             supports_metadata=False,
             download_config=self.download_config,
         )
-        hash = self.revision
         builder_kwargs = {
             "repo_id": self.name,
             "dataset_name": camelcase_to_snakecase(Path(self.name).name),
@@ -1211,7 +1222,7 @@ def get_module(self) -> DatasetModule:
 
         return DatasetModule(
             module_path,
-            hash,
+            self.commit_hash,
             builder_kwargs,
             dataset_infos=dataset_infos,
             builder_configs_parameters=BuilderConfigsParameters(
@@ -1231,14 +1242,14 @@ class HubDatasetModuleFactoryWithScript(_DatasetModuleFactory):
     def __init__(
         self,
         name: str,
-        revision: Optional[Union[str, Version]] = None,
+        commit_hash: str,
         download_config: Optional[DownloadConfig] = None,
         download_mode: Optional[Union[DownloadMode, str]] = None,
         dynamic_modules_path: Optional[str] = None,
         trust_remote_code: Optional[bool] = None,
     ):
         self.name = name
-        self.revision = revision
+        self.commit_hash = commit_hash
         self.download_config = download_config or DownloadConfig()
         self.download_mode = download_mode
         self.dynamic_modules_path = dynamic_modules_path
@@ -1246,14 +1257,14 @@ def __init__(
         increase_load_count(name)
 
     def download_loading_script(self) -> str:
-        file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
+        file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.commit_hash)
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
             download_config.download_desc = "Downloading builder script"
         return cached_path(file_path, download_config=download_config)
 
     def download_dataset_infos_file(self) -> str:
-        dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
+        dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash)
         # Download the dataset infos file if available
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
@@ -1267,7 +1278,7 @@ def download_dataset_infos_file(self) -> str:
         return None
 
     def download_dataset_readme_file(self) -> str:
-        readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
+        readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.commit_hash)
         # Download the dataset infos file if available
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
@@ -1296,7 +1307,7 @@ def get_module(self) -> DatasetModule:
         imports = get_imports(local_path)
         local_imports, library_imports = _download_additional_modules(
             name=self.name,
-            base_path=hf_dataset_url(self.name, "", revision=self.revision),
+            base_path=hf_dataset_url(self.name, "", revision=self.commit_hash),
             imports=imports,
             download_config=self.download_config,
         )
@@ -1343,7 +1354,7 @@ def get_module(self) -> DatasetModule:
         # make the new module to be noticed by the import system
         importlib.invalidate_caches()
         builder_kwargs = {
-            "base_path": hf_dataset_url(self.name, "", revision=self.revision).rstrip("/"),
+            "base_path": hf_dataset_url(self.name, "", revision=self.commit_hash).rstrip("/"),
             "repo_id": self.name,
         }
         return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
@@ -1574,46 +1585,74 @@ def dataset_module_factory(
     # Try remotely
     elif is_relative_path(path) and path.count("/") <= 1:
         try:
-            _raise_if_offline_mode_is_enabled()
-            hf_api = HfApi(config.HF_ENDPOINT)
+            # Get the Dataset Card + get the revision + check authentication all at in one call
+            # We fix the commit_hash in case there are new commits in the meantime
+            api = HfApi(
+                endpoint=config.HF_ENDPOINT,
+                token=download_config.token,
+                library_name="datasets",
+                library_version=__version__,
+                user_agent=get_datasets_user_agent(download_config.user_agent),
+            )
             try:
-                dataset_info = hf_api.dataset_info(
+                _raise_if_offline_mode_is_enabled()
+                dataset_readme_path = api.hf_hub_download(
                     repo_id=path,
+                    filename=config.REPOCARD_FILENAME,
+                    repo_type="dataset",
                     revision=revision,
-                    token=download_config.token,
-                    timeout=100.0,
+                    proxies=download_config.proxies,
                 )
+                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+            except LocalEntryNotFoundError as e:
+                if isinstance(
+                    e.__cause__,
+                    (
+                        OfflineModeIsEnabled,
+                        requests.exceptions.ConnectTimeout,
+                        requests.exceptions.ConnectionError,
+                    ),
+                ):
+                    raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
+                else:
+                    raise
+            except EntryNotFoundError:
+                commit_hash = api.dataset_info(
+                    path,
+                    revision=revision,
+                    timeout=100.0,
+                ).sha
             except (
                 OfflineModeIsEnabled,
                 requests.exceptions.ConnectTimeout,
                 requests.exceptions.ConnectionError,
             ) as e:
                 raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
+            except GatedRepoError as e:
+                message = f"Dataset '{path}' is a gated dataset on the Hub."
+                if e.response.status_code == 401:
+                    message += " You must be authenticated to access it."
+                elif e.response.status_code == 403:
+                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
+                raise DatasetNotFoundError(message) from e
             except RevisionNotFoundError as e:
                 raise DatasetNotFoundError(
                     f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
                 ) from e
             except RepositoryNotFoundError as e:
                 raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e
-            if dataset_info.gated:
-                try:
-                    check_auth(hf_api, repo_id=path, token=download_config.token)
-                except GatedRepoError as e:
-                    message = f"Dataset '{path}' is a gated dataset on the Hub."
-                    if "401 Client Error" in str(e):
-                        message += " You must be authenticated to access it."
-                    elif "403 Client Error" in str(e):
-                        message += (
-                            f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
-                        )
-                    raise DatasetNotFoundError(message) from e
-
-            if filename in [sibling.rfilename for sibling in dataset_info.siblings]:  # contains a dataset script
-                fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
+            try:
+                dataset_script_path = api.hf_hub_download(
+                    repo_id=path,
+                    filename=filename,
+                    repo_type="dataset",
+                    revision=commit_hash,
+                    proxies=download_config.proxies,
+                )
                 if _require_custom_configs or (revision and revision != "main"):
                     can_load_config_from_parquet_export = False
                 elif _require_default_config_name:
-                    with fs.open(f"datasets/{path}/{filename}", "r", encoding="utf-8") as f:
+                    with open(dataset_script_path, "r", encoding="utf-8") as f:
                         can_load_config_from_parquet_export = "DEFAULT_CONFIG_NAME" not in f.read()
                 else:
                     can_load_config_from_parquet_export = True
@@ -1622,29 +1661,48 @@ def dataset_module_factory(
                     # This fails when the dataset has multiple configs and a default config and
                     # the user didn't specify a configuration name (_require_default_config_name=True).
                     try:
-                        return HubDatasetModuleFactoryWithParquetExport(
-                            path, download_config=download_config, revision=dataset_info.sha
+                        out = HubDatasetModuleFactoryWithParquetExport(
+                            path, download_config=download_config, commit_hash=commit_hash
                         ).get_module()
+                        logger.info("Loading the dataset from the Parquet export on Hugging Face.")
+                        return out
                     except _dataset_viewer.DatasetViewerError:
                         pass
                 # Otherwise we must use the dataset script if the user trusts it
                 return HubDatasetModuleFactoryWithScript(
                     path,
-                    revision=revision,
+                    commit_hash=commit_hash,
                     download_config=download_config,
                     download_mode=download_mode,
                     dynamic_modules_path=dynamic_modules_path,
                     trust_remote_code=trust_remote_code,
                 ).get_module()
-            else:
+            except EntryNotFoundError:
+                # Use the infos from the parquet export except in some cases:
+                if data_dir or data_files or (revision and revision != "main"):
+                    use_exported_dataset_infos = False
+                else:
+                    use_exported_dataset_infos = True
                 return HubDatasetModuleFactoryWithoutScript(
                     path,
-                    revision=revision,
+                    commit_hash=commit_hash,
                     data_dir=data_dir,
                     data_files=data_files,
                     download_config=download_config,
                     download_mode=download_mode,
+                    use_exported_dataset_infos=use_exported_dataset_infos,
                 ).get_module()
+            except GatedRepoError as e:
+                message = f"Dataset '{path}' is a gated dataset on the Hub."
+                if e.response.status_code == 401:
+                    message += " You must be authenticated to access it."
+                elif e.response.status_code == 403:
+                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
+                raise DatasetNotFoundError(message) from e
+            except RevisionNotFoundError as e:
+                raise DatasetNotFoundError(
+                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
+                ) from e
         except Exception as e1:
             # All the attempts failed, before raising the error we should check if the module is already cached
             try:
diff --git a/src/datasets/utils/_dataset_viewer.py b/src/datasets/utils/_dataset_viewer.py
index b8cf6ea49e1..092741a956c 100644
--- a/src/datasets/utils/_dataset_viewer.py
+++ b/src/datasets/utils/_dataset_viewer.py
@@ -23,7 +23,9 @@ class DatasetViewerError(DatasetsError):
     """
 
 
-def get_exported_parquet_files(dataset: str, revision: str, token: Optional[Union[str, bool]]) -> List[Dict[str, Any]]:
+def get_exported_parquet_files(
+    dataset: str, commit_hash: str, token: Optional[Union[str, bool]]
+) -> List[Dict[str, Any]]:
     """
     Get the dataset exported parquet files
     Docs: https://huggingface.co/docs/datasets-server/parquet
@@ -37,7 +39,7 @@ def get_exported_parquet_files(dataset: str, revision: str, token: Optional[Unio
         )
         parquet_data_files_response.raise_for_status()
         if "X-Revision" in parquet_data_files_response.headers:
-            if parquet_data_files_response.headers["X-Revision"] == revision or revision is None:
+            if parquet_data_files_response.headers["X-Revision"] == commit_hash or commit_hash is None:
                 parquet_data_files_response_json = parquet_data_files_response.json()
                 if (
                     parquet_data_files_response_json.get("partial") is False
@@ -50,7 +52,7 @@ def get_exported_parquet_files(dataset: str, revision: str, token: Optional[Unio
                 logger.debug(f"Parquet export for {dataset} is not completely ready yet.")
             else:
                 logger.debug(
-                    f"Parquet export for {dataset} is available but outdated (revision='{parquet_data_files_response.headers['X-Revision']}')"
+                    f"Parquet export for {dataset} is available but outdated (commit_hash='{parquet_data_files_response.headers['X-Revision']}')"
                 )
     except Exception as e:  # noqa catch any exception of the dataset viewer API and consider the parquet export doesn't exist
         logger.debug(f"No parquet export for {dataset} available ({type(e).__name__}: {e})")
@@ -58,7 +60,7 @@ def get_exported_parquet_files(dataset: str, revision: str, token: Optional[Unio
 
 
 def get_exported_dataset_infos(
-    dataset: str, revision: str, token: Optional[Union[str, bool]]
+    dataset: str, commit_hash: str, token: Optional[Union[str, bool]]
 ) -> Dict[str, Dict[str, Any]]:
     """
     Get the dataset information, can be useful to get e.g. the dataset features.
@@ -73,7 +75,7 @@ def get_exported_dataset_infos(
         )
         info_response.raise_for_status()
         if "X-Revision" in info_response.headers:
-            if info_response.headers["X-Revision"] == revision or revision is None:
+            if info_response.headers["X-Revision"] == commit_hash or commit_hash is None:
                 info_response = info_response.json()
                 if (
                     info_response.get("partial") is False
@@ -86,7 +88,7 @@ def get_exported_dataset_infos(
                 logger.debug(f"Dataset info for {dataset} is not completely ready yet.")
             else:
                 logger.debug(
-                    f"Dataset info for {dataset} is available but outdated (revision='{info_response.headers['X-Revision']}')"
+                    f"Dataset info for {dataset} is available but outdated (commit_hash='{info_response.headers['X-Revision']}')"
                 )
     except Exception as e:  # noqa catch any exception of the dataset viewer API and consider the dataset info doesn't exist
         logger.debug(f"No dataset info for {dataset} available ({type(e).__name__}: {e})")
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 9bd2a1c3928..e44b1ce12bc 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -282,14 +282,10 @@ def get_authentication_headers_for_url(url: str, token: Optional[Union[str, bool
     return {}
 
 
-class OfflineModeIsEnabled(ConnectionError):
-    pass
-
-
 def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
     """Raise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_HUB_OFFLINE is True."""
     if config.HF_HUB_OFFLINE:
-        raise OfflineModeIsEnabled(
+        raise huggingface_hub.errors.OfflineModeIsEnabled(
             "Offline mode is enabled." if msg is None else "Offline mode is enabled. " + str(msg)
         )
 
diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index fa463272213..21629407e4c 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -102,7 +102,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
     @classmethod
     def _from_exported_parquet_files_and_dataset_infos(
         cls,
-        revision: str,
+        parquet_commit_hash: str,
         exported_parquet_files: List[Dict[str, Any]],
         dataset_infos: DatasetInfosDict,
     ) -> "MetadataConfigs":
@@ -112,7 +112,7 @@ def _from_exported_parquet_files_and_dataset_infos(
                     {
                         "split": split_name,
                         "path": [
-                            parquet_file["url"].replace("refs%2Fconvert%2Fparquet", revision)
+                            parquet_file["url"].replace("refs%2Fconvert%2Fparquet", parquet_commit_hash)
                             for parquet_file in parquet_files_for_split
                         ],
                     }
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index e82b5d11dc9..6f6ac01df9a 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -7,10 +7,10 @@
 import zstandard as zstd
 from fsspec.registry import _registry as _fsspec_registry
 from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
+from huggingface_hub.errors import OfflineModeIsEnabled
 
 from datasets.download.download_config import DownloadConfig
 from datasets.utils.file_utils import (
-    OfflineModeIsEnabled,
     _get_extraction_protocol,
     _prepare_single_hop_path_and_storage_options,
     cached_path,
diff --git a/tests/test_load.py b/tests/test_load.py
index 3595f9735ec..5d551e8afbe 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -90,8 +90,13 @@ def _generate_examples(self, filepath, **kwargs):
 SAMPLE_DATASET_IDENTIFIER3 = "hf-internal-testing/multi_dir_dataset"  # has multiple data directories
 SAMPLE_DATASET_IDENTIFIER4 = "hf-internal-testing/imagefolder_with_metadata"  # imagefolder with a metadata file outside of the train/test directories
 SAMPLE_DATASET_IDENTIFIER5 = "hf-internal-testing/imagefolder_with_metadata_no_splits"  # imagefolder with a metadata file and no default split names in data files
-SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "hf-internal-testing/_dummy"
-SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST = "_dummy"
+
+SAMPLE_DATASET_COMMIT_HASH = "0e1cee81e718feadf49560b287c4eb669c2efb1a"
+SAMPLE_DATASET_COMMIT_HASH2 = "c19550d35263090b1ec2bfefdbd737431fafec40"
+SAMPLE_DATASET_COMMIT_HASH3 = "aaa2d4bdd1d877d1c6178562cfc584bdfa90f6dc"
+SAMPLE_DATASET_COMMIT_HASH4 = "a7415617490f32e51c2f0ea20b5ce7cfba035a62"
+SAMPLE_DATASET_COMMIT_HASH5 = "4971fa562942cab8263f56a448c3f831b18f1c27"
+
 SAMPLE_DATASET_NO_CONFIGS_IN_METADATA = "hf-internal-testing/audiofolder_no_configs_in_metadata"
 SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_single_config_in_metadata"
 SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata"
@@ -100,6 +105,15 @@ def _generate_examples(self, filepath, **kwargs):
 )
 SAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME = "hf-internal-testing/DatasetWithCapitalLetters"
 
+SAMPLE_DATASET_NO_CONFIGS_IN_METADATA_COMMIT_HASH = "26cd5079bb0d3cd1521c6894765a0b8edb159d7f"
+SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA_COMMIT_HASH = "1668dfc91efae975e44457cdabef60fb9200820a"
+SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_COMMIT_HASH = "e71bce498e6c2bd2c58b20b097fdd3389793263f"
+SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT_COMMIT_HASH = "38937109bb4dc7067f575fe6e7b420158eb9cf32"
+SAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME_COMMIT_HASH = "70aa36264a6954920a13dd0465156a60b9f8af4b"
+
+SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "hf-internal-testing/_dummy"
+SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST = "_dummy"
+
 
 @pytest.fixture
 def data_dir(tmp_path):
@@ -388,14 +402,16 @@ def setUp(self):
     def test_HubDatasetModuleFactoryWithScript_dont_trust_remote_code(self):
         factory = HubDatasetModuleFactoryWithScript(
-            "hf-internal-testing/dataset_with_script",
+            SAMPLE_DATASET_IDENTIFIER,
+            commit_hash=SAMPLE_DATASET_COMMIT_HASH,
             download_config=self.download_config,
             dynamic_modules_path=self.dynamic_modules_path,
         )
         with patch.object(config, "HF_DATASETS_TRUST_REMOTE_CODE", None):  # this will be the default soon
             self.assertRaises(ValueError, factory.get_module)
         factory = HubDatasetModuleFactoryWithScript(
-            "hf-internal-testing/dataset_with_script",
+            SAMPLE_DATASET_IDENTIFIER,
+            commit_hash=SAMPLE_DATASET_COMMIT_HASH,
             download_config=self.download_config,
             dynamic_modules_path=self.dynamic_modules_path,
             trust_remote_code=False,
         )
@@ -406,9 +422,9 @@ def test_HubDatasetModuleFactoryWithScript_with_hub_dataset(self):
         # "wmt_t2t" has additional imports (internal)
         factory = HubDatasetModuleFactoryWithScript(
             "wmt_t2t",
+            commit_hash="861aac88b2c6247dd93ade8b1c189ce714627750",
             download_config=self.download_config,
             dynamic_modules_path=self.dynamic_modules_path,
-            revision="861aac88b2c6247dd93ade8b1c189ce714627750",
             trust_remote_code=True,
         )
         module_factory_result = factory.get_module()
@@ -616,7 +632,7 @@ def test_PackagedDatasetModuleFactory_with_data_dir_and_metadata(self):
     @pytest.mark.integration
     def test_HubDatasetModuleFactoryWithoutScript(self):
         factory = HubDatasetModuleFactoryWithoutScript(
-            SAMPLE_DATASET_IDENTIFIER2, download_config=self.download_config
+            SAMPLE_DATASET_IDENTIFIER2, commit_hash=SAMPLE_DATASET_COMMIT_HASH2, download_config=self.download_config
         )
         module_factory_result = factory.get_module()
         assert importlib.import_module(module_factory_result.module_path) is not None
@@ -626,7 +642,10 @@ def test_HubDatasetModuleFactoryWithoutScript(self):
     def test_HubDatasetModuleFactoryWithoutScript_with_data_dir(self):
         data_dir = "data2"
         factory = HubDatasetModuleFactoryWithoutScript(
-            SAMPLE_DATASET_IDENTIFIER3, data_dir=data_dir, download_config=self.download_config
+            SAMPLE_DATASET_IDENTIFIER3,
+            commit_hash=SAMPLE_DATASET_COMMIT_HASH3,
+            data_dir=data_dir,
+            download_config=self.download_config,
         )
         module_factory_result = factory.get_module()
         assert importlib.import_module(module_factory_result.module_path) is not None
@@ -645,7 +664,7 @@ def test_HubDatasetModuleFactoryWithoutScript_with_data_dir(self):
     @pytest.mark.integration
     def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self):
         factory = HubDatasetModuleFactoryWithoutScript(
-            SAMPLE_DATASET_IDENTIFIER4, download_config=self.download_config
+            SAMPLE_DATASET_IDENTIFIER4, commit_hash=SAMPLE_DATASET_COMMIT_HASH4, download_config=self.download_config
         )
         module_factory_result = factory.get_module()
         assert importlib.import_module(module_factory_result.module_path) is not None
@@ -660,7 +679,7 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self):
         assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["test"])
 
         factory = HubDatasetModuleFactoryWithoutScript(
-            SAMPLE_DATASET_IDENTIFIER5, download_config=self.download_config
+            SAMPLE_DATASET_IDENTIFIER5, commit_hash=SAMPLE_DATASET_COMMIT_HASH5, download_config=self.download_config
         )
         module_factory_result = factory.get_module()
         assert importlib.import_module(module_factory_result.module_path) is not None
@@ -677,6 +696,7 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self):
     def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadata(self):
         factory = HubDatasetModuleFactoryWithoutScript(
             SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA,
+            commit_hash=SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA_COMMIT_HASH,
             download_config=self.download_config,
         )
         module_factory_result = factory.get_module()
@@ -714,9 +734,17 @@ def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadat
 
     @pytest.mark.integration
     def test_HubDatasetModuleFactoryWithoutScript_with_two_configs_in_metadata(self):
-        datasets_names = [SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT]
-        for dataset_name in datasets_names:
-            factory = HubDatasetModuleFactoryWithoutScript(dataset_name, download_config=self.download_config)
+        datasets_names = [
+            (SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_COMMIT_HASH),
+            (
+                SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT,
+                SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT_COMMIT_HASH,
+            ),
+        ]
+        for dataset_name, commit_hash in datasets_names:
+            factory = HubDatasetModuleFactoryWithoutScript(
+                dataset_name, commit_hash=commit_hash, download_config=self.download_config
+            )
             module_factory_result = factory.get_module()
             assert importlib.import_module(module_factory_result.module_path) is not None
 
@@ -767,6 +795,7 @@ def test_HubDatasetModuleFactoryWithoutScript_with_two_configs_in_metadata(self)
     def test_HubDatasetModuleFactoryWithScript(self):
         factory = HubDatasetModuleFactoryWithScript(
             SAMPLE_DATASET_IDENTIFIER,
+            commit_hash=SAMPLE_DATASET_COMMIT_HASH,
             download_config=self.download_config,
             dynamic_modules_path=self.dynamic_modules_path,
             trust_remote_code=True,
         )
@@ -779,6 +808,7 @@ def test_HubDatasetModuleFactoryWithScript(self):
     def test_HubDatasetModuleFactoryWithParquetExport(self):
         factory = HubDatasetModuleFactoryWithParquetExport(
             SAMPLE_DATASET_IDENTIFIER,
+            commit_hash=SAMPLE_DATASET_COMMIT_HASH,
             download_config=self.download_config,
         )
         module_factory_result = factory.get_module()
@@ -802,13 +832,13 @@ def test_HubDatasetModuleFactoryWithParquetExport_errors_on_wrong_sha(self):
         factory = HubDatasetModuleFactoryWithParquetExport(
             SAMPLE_DATASET_IDENTIFIER,
             download_config=self.download_config,
-            revision="0e1cee81e718feadf49560b287c4eb669c2efb1a",
+            commit_hash=SAMPLE_DATASET_COMMIT_HASH,
         )
         factory.get_module()
         factory = HubDatasetModuleFactoryWithParquetExport(
             SAMPLE_DATASET_IDENTIFIER,
             download_config=self.download_config,
-            revision="wrong_sha",
+            commit_hash="wrong_sha",
         )
         with self.assertRaises(_dataset_viewer.DatasetViewerError):
             factory.get_module()
@@ -846,19 +876,22 @@ def test_CachedDatasetModuleFactory_with_script(self):
 
 
 @pytest.mark.parametrize(
-    "factory_class",
+    "factory_class,requires_commit_hash",
     [
-        CachedDatasetModuleFactory,
-        HubDatasetModuleFactoryWithoutScript,
-        HubDatasetModuleFactoryWithScript,
-        LocalDatasetModuleFactoryWithoutScript,
-        LocalDatasetModuleFactoryWithScript,
-        PackagedDatasetModuleFactory,
+        (CachedDatasetModuleFactory, False),
+        (HubDatasetModuleFactoryWithoutScript, True),
+        (HubDatasetModuleFactoryWithScript, True),
+        (LocalDatasetModuleFactoryWithoutScript, False),
+        (LocalDatasetModuleFactoryWithScript, False),
+        (PackagedDatasetModuleFactory, False),
     ],
 )
-def test_module_factories(factory_class):
+def test_module_factories(factory_class, requires_commit_hash):
     name = "dummy_name"
-    factory = factory_class(name)
+    if requires_commit_hash:
+        factory = factory_class(name, commit_hash="foo")
+    else:
+        factory = factory_class(name)
     assert factory.name == name
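
For reference, a minimal usage sketch (not part of the patch) of one of the refactored factories, mirroring the updated tests above. It assumes network access to the Hub and reuses the `hf-internal-testing/dataset_with_script` fixture and its pinned commit hash from `tests/test_load.py`; module resolution is now pinned to an exact commit rather than a branch name.

from datasets.download.download_config import DownloadConfig
from datasets.load import HubDatasetModuleFactoryWithScript

# Resolve the dataset module at a pinned commit, as the updated tests do.
factory = HubDatasetModuleFactoryWithScript(
    "hf-internal-testing/dataset_with_script",  # SAMPLE_DATASET_IDENTIFIER in the tests
    commit_hash="0e1cee81e718feadf49560b287c4eb669c2efb1a",  # SAMPLE_DATASET_COMMIT_HASH in the tests
    download_config=DownloadConfig(),
    trust_remote_code=True,  # this fixture ships a loading script
)
module = factory.get_module()
print(module.hash)  # the DatasetModule hash is now the pinned commit hash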