Release 2.19.2 #6943

Merged 5 commits on Jun 3, 2024
6 changes: 3 additions & 3 deletions setup.py
@@ -122,7 +122,7 @@
# For performance gains with apache arrow
"pandas",
# for downloading datasets over HTTPS
"requests>=2.19.0",
"requests>=2.32.1",
# progress bars in download and scripts
"tqdm>=4.62.1",
# for fast hashing
@@ -148,7 +148,7 @@
]

VISION_REQUIRE = [
"Pillow>=6.2.1",
"Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced
]

BENCHMARKS_REQUIRE = [
@@ -252,7 +252,7 @@

setup(
name="datasets",
version="2.19.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="2.19.2", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="HuggingFace community-driven open-source library of datasets",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
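For context on the new Pillow floor (a sketch, not part of the diff): the inline comment points at PIL.Image.ExifTags, whose enum-style tags such as ExifTags.Base first shipped in Pillow 9.4.0. The snippet below assumes a hypothetical local file photo.jpg.

from PIL import ExifTags, Image, ImageOps

with Image.open("photo.jpg") as img:  # hypothetical input file
    # ExifTags.Base.Orientation (EXIF tag 274) is addressable by name from Pillow 9.4.0 onward.
    orientation = img.getexif().get(ExifTags.Base.Orientation)
    if orientation is not None:
        img = ImageOps.exif_transpose(img)  # undo the camera rotation recorded in EXIF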
2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "2.19.1"
__version__ = "2.19.2"

from .arrow_dataset import Dataset
from .arrow_reader import ReadInstruction
6 changes: 5 additions & 1 deletion src/datasets/commands/convert_to_parquet.py
@@ -3,6 +3,7 @@
from typing import Optional

from huggingface_hub import HfApi, create_branch, get_repo_discussions
+from huggingface_hub.utils import HfHubHTTPError

from datasets import get_dataset_config_names, get_dataset_default_config_name, load_dataset
from datasets.commands import BaseDatasetsCLICommand
@@ -89,7 +90,10 @@ def run(self) -> None:
time.sleep(5)
delete_files(dataset_id, revision=pr_revision, token=token)
if not revision:
-create_branch(dataset_id, branch="script", repo_type="dataset", token=token, exist_ok=True)
+try:
+    create_branch(dataset_id, branch="script", repo_type="dataset", token=token, exist_ok=True)
+except HfHubHTTPError:
+    pass
print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")


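The hunk above makes the branch backup in convert_to_parquet best-effort: if huggingface_hub's create_branch fails with an HfHubHTTPError (for example, insufficient permissions on the repo), the command now continues and still prints the PR URL. A minimal standalone sketch of the same pattern, using a hypothetical repo id:

from huggingface_hub import create_branch
from huggingface_hub.utils import HfHubHTTPError

try:
    # exist_ok=True already tolerates an existing "script" branch; the except clause
    # additionally tolerates HTTP errors such as a 403 on a repo the token cannot write to.
    create_branch("user/my-dataset", branch="script", repo_type="dataset", exist_ok=True)
except HfHubHTTPError:
    pass  # the branch backup is optional; the Parquet conversion PR already exists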
7 changes: 6 additions & 1 deletion src/datasets/load.py
@@ -1235,7 +1235,12 @@ def get_module(self) -> DatasetModule:
pass
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
-if config.USE_PARQUET_EXPORT: # maybe don't use the infos from the parquet export
+# Use the infos from the parquet export except in some cases:
+if self.data_dir or self.data_files or (self.revision and self.revision != "main"):
+    use_exported_dataset_infos = False
+else:
+    use_exported_dataset_infos = True
+if config.USE_PARQUET_EXPORT and use_exported_dataset_infos:
try:
exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
dataset=self.name, revision=self.revision, token=self.download_config.token
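The hunk above narrows when the dataset viewer's exported Parquet infos are trusted: they describe the full dataset on the main revision, so they are skipped whenever data_dir, data_files, or a non-main revision is passed. A small self-contained restatement of that guard (a sketch, not the library's API):

from typing import List, Optional, Union


def should_use_exported_infos(
    data_dir: Optional[str],
    data_files: Optional[Union[str, List[str]]],
    revision: Optional[str],
) -> bool:
    # Exported infos only match the default data on "main"; any narrowing disables them.
    if data_dir or data_files or (revision and revision != "main"):
        return False
    return True


assert should_use_exported_infos(None, None, None) is True
assert should_use_exported_infos("data1", None, None) is False
assert should_use_exported_infos(None, "data1/train.txt", None) is False
assert should_use_exported_infos(None, None, "refs/convert/parquet") is False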
15 changes: 15 additions & 0 deletions tests/test_load.py
@@ -1267,6 +1267,21 @@ def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir,
assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value)


+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "kwargs, expected_train_num_rows, expected_test_num_rows",
+    [
+        ({}, 2, 2),
+        ({"data_dir": "data1"}, 1, 1),  # GH-6918: NonMatchingSplitsSizesError
+        ({"data_files": "data1/train.txt"}, 1, None),  # GH-6939: ExpectedMoreSplits
+    ],
+)
+def test_load_dataset_without_script_from_hub(kwargs, expected_train_num_rows, expected_test_num_rows):
+    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3, **kwargs)
+    assert dataset["train"].num_rows == expected_train_num_rows
+    assert (dataset["test"].num_rows == expected_test_num_rows) if expected_test_num_rows else ("test" not in dataset)


@pytest.mark.integration
@pytest.mark.parametrize("stream_from_cache, ", [False, True])
def test_load_dataset_cached_from_hub(stream_from_cache, caplog):
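The new test above captures the user-facing effect of the load.py change. A hedged usage sketch; user/sample-dataset and the data1 directory are hypothetical stand-ins for the test fixture repo:

from datasets import load_dataset

# Full dataset: infos from the Parquet export are still used for verification.
ds = load_dataset("user/sample-dataset")

# Loading only a sub-directory: previously this could raise NonMatchingSplitsSizesError
# (GH-6918), because the subset was verified against infos for the full dataset.
ds_dir = load_dataset("user/sample-dataset", data_dir="data1")

# Loading explicit files: previously this could raise ExpectedMoreSplits (GH-6939),
# since the exported infos still advertised a "test" split that was never requested.
ds_files = load_dataset("user/sample-dataset", data_files="data1/train.txt")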