From b0090c509ceb10ed107ec4b9240fd90e89f5df39 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 8 Jul 2024 14:36:16 +0200 Subject: [PATCH 1/5] add zstd extension for zstd compressed files (along with zst) --- src/datasets/utils/file_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 2fb6ca20438..5cce2e0e199 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -751,6 +751,7 @@ def readline(f: io.RawIOBase): **{fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS}, # archive compression "zip": "zip", + "zstd": "zstd", } SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL = { fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS From 008f53842bb5f8606b15fdbdaa70f24c803b8a90 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 8 Jul 2024 14:42:39 +0200 Subject: [PATCH 2/5] fix typing --- src/datasets/filesystems/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/filesystems/__init__.py b/src/datasets/filesystems/__init__.py index 5c48c7069ce..c627b1364f7 100644 --- a/src/datasets/filesystems/__init__.py +++ b/src/datasets/filesystems/__init__.py @@ -16,7 +16,7 @@ if _has_s3fs: from .s3filesystem import S3FileSystem -COMPRESSION_FILESYSTEMS: List[compression.BaseCompressedFileFileSystem] = [ +COMPRESSION_FILESYSTEMS: List[type[compression.BaseCompressedFileFileSystem]] = [ compression.Bz2FileSystem, compression.GzipFileSystem, compression.Lz4FileSystem, From 18e9c4e9d3159a4463035a0fa6ad88ae3859b5b1 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 8 Jul 2024 16:14:27 +0200 Subject: [PATCH 3/5] Revert "fix typing" This reverts commit 008f53842bb5f8606b15fdbdaa70f24c803b8a90. --- src/datasets/filesystems/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/filesystems/__init__.py b/src/datasets/filesystems/__init__.py index c627b1364f7..5c48c7069ce 100644 --- a/src/datasets/filesystems/__init__.py +++ b/src/datasets/filesystems/__init__.py @@ -16,7 +16,7 @@ if _has_s3fs: from .s3filesystem import S3FileSystem -COMPRESSION_FILESYSTEMS: List[type[compression.BaseCompressedFileFileSystem]] = [ +COMPRESSION_FILESYSTEMS: List[compression.BaseCompressedFileFileSystem] = [ compression.Bz2FileSystem, compression.GzipFileSystem, compression.Lz4FileSystem, From 1072d9109e5c15e1b5c0481e5bdccb1cae7b760f Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 8 Jul 2024 16:20:37 +0200 Subject: [PATCH 4/5] add comment --- src/datasets/utils/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 5cce2e0e199..e4f8a91976f 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -749,9 +749,9 @@ def readline(f: io.RawIOBase): COMPRESSION_EXTENSION_TO_PROTOCOL = { # single file compression **{fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS}, + "zstd": "zstd", # in addition to `zst`, see ZstdFileSystem.extension # archive compression "zip": "zip", - "zstd": "zstd", } SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL = { fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS From 5b8e38ab2b8266c2f7baab586fa06982297e266e Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 8 Jul 2024 16:27:53 +0200 Subject: [PATCH 5/5] add to SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL too --- src/datasets/utils/file_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index e4f8a91976f..8dd66081761 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -754,7 +754,8 @@ def readline(f: io.RawIOBase): "zip": "zip", } SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL = { - fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS + **{fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS}, + "zstd": "zstd", # in addition to `zst`, see ZstdFileSystem.extension } SINGLE_FILE_COMPRESSION_PROTOCOLS = {fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS} SINGLE_SLASH_AFTER_PROTOCOL_PATTERN = re.compile(r"(?