Skip to content

Commit

Permalink
no need for dataset_info
Browse files (browse the repository at this point in the history)
  • Loading branch information
lhoestq committed Oct 17, 2024
1 parent d4422cc commit d07730a
Showing 1 changed file with 25 additions and 22 deletions.
47 changes: 25 additions & 22 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError, get_session

from . import config
from . import __version__, config
from .arrow_dataset import Dataset
from .builder import BuilderConfig, DatasetBuilder
from .data_files import (
Expand Down Expand Up @@ -989,34 +989,37 @@ def __init__(
increase_load_count(name)

def get_module(self) -> DatasetModule:
hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
self.name,
revision=self.revision,
# Get the Dataset Card and fix the revision in case there are new commits in the meantime
api = HfApi(
endpoint=config.HF_ENDPOINT,
token=self.download_config.token,
timeout=100.0,
library_name="datasets",
library_version=__version__,
user_agent=get_datasets_user_agent(self.download_config.user_agent),
)
# even if metadata_configs is not None (which means that we will resolve files for each config later)
# we cannot skip resolving all files because we need to infer module name by files extensions
revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime
base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip("/")

download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading readme"
try:
dataset_readme_path = cached_path(
hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
download_config=download_config,
dataset_readme_path = api.hf_hub_download(
repo_id=self.name,
filename=config.REPOCARD_FILENAME,
repo_type="dataset",
revision=self.revision,
proxies=self.download_config.proxies,
)
dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
commit_hash = os.path.dirname(dataset_readme_path)
dataset_card_data = DatasetCard.load(dataset_readme_path).data
except FileNotFoundError:
commit_hash = api.dataset_info(
self.name,
revision=self.revision,
timeout=100.0,
).sha
dataset_card_data = DatasetCardData()
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading standalone yaml"
try:
standalone_yaml_path = cached_path(
hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=commit_hash),
download_config=download_config,
)
with open(standalone_yaml_path, "r", encoding="utf-8") as f:
Expand All @@ -1027,6 +1030,7 @@ def get_module(self) -> DatasetModule:
dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
except FileNotFoundError:
pass
base_path = f"hf://datasets/{self.name}@{commit_hash}/{self.data_dir or ''}".rstrip("/")
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# Use the infos from the parquet export except in some cases:
Expand Down Expand Up @@ -1110,7 +1114,7 @@ def get_module(self) -> DatasetModule:
]
default_config_name = None
builder_kwargs = {
"base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=commit_hash).rstrip("/"),
"repo_id": self.name,
"dataset_name": camelcase_to_snakecase(Path(self.name).name),
}
Expand All @@ -1122,7 +1126,7 @@ def get_module(self) -> DatasetModule:
try:
# this file is deprecated and was created automatically in old versions of push_to_hub
dataset_infos_path = cached_path(
hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=commit_hash),
download_config=download_config,
)
with open(dataset_infos_path, encoding="utf-8") as f:
Expand All @@ -1143,10 +1147,9 @@ def get_module(self) -> DatasetModule:
if default_config_name is None and len(dataset_infos) == 1:
default_config_name = next(iter(dataset_infos))

hash = revision
return DatasetModule(
module_path,
hash,
commit_hash,
builder_kwargs,
dataset_infos=dataset_infos,
builder_configs_parameters=BuilderConfigsParameters(
Expand Down

0 comments on commit d07730a

Please sign in to comment.