From 972b26e3ac11d2a766178d10a2ee6f2f2ceca690 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 25 Oct 2024 18:49:23 +0200 Subject: [PATCH] add video --- front/admin_ui/poetry.lock | 40 +++++++-- jobs/cache_maintenance/poetry.lock | 40 +++++++-- jobs/mongodb_migration/poetry.lock | 40 +++++++-- libs/libapi/poetry.lock | 40 +++++++-- libs/libapi/src/libapi/rows_utils.py | 6 +- libs/libapi/src/libapi/utils.py | 2 + libs/libcommon/poetry.lock | 39 +++++++-- libs/libcommon/pyproject.toml | 3 +- libs/libcommon/src/libcommon/constants.py | 1 + .../src/libcommon/viewer_utils/asset.py | 51 ++++++++++- .../src/libcommon/viewer_utils/features.py | 85 ++++++++++++++++++- .../src/libcommon/viewer_utils/rows.py | 4 + libs/libcommon/tests/test_url_preparator.py | 3 + .../tests/viewer_utils/test_features.py | 2 + .../libcommon/tests/viewer_utils/test_rows.py | 4 + services/admin/poetry.lock | 40 +++++++-- services/api/poetry.lock | 40 +++++++-- services/rows/poetry.lock | 40 +++++++-- services/search/poetry.lock | 40 +++++++-- services/search/src/search/routes/search.py | 3 + services/sse-api/poetry.lock | 40 +++++++-- services/webhook/poetry.lock | 40 +++++++-- services/worker/poetry.lock | 40 +++++++-- .../job_runners/config/parquet_and_info.py | 17 +++- .../worker/job_runners/split/first_rows.py | 6 ++ 25 files changed, 552 insertions(+), 114 deletions(-) diff --git a/front/admin_ui/poetry.lock b/front/admin_ui/poetry.lock index f3a4b25071..d8b4f856dc 100644 --- a/front/admin_ui/poetry.lock +++ b/front/admin_ui/poetry.lock @@ -624,14 +624,12 @@ files = [ [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -656,18 +654,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -679,6 +683,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "dill" version = "0.3.6" @@ -1449,7 +1470,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/jobs/cache_maintenance/poetry.lock b/jobs/cache_maintenance/poetry.lock index d8a0792246..3641112569 100644 --- a/jobs/cache_maintenance/poetry.lock +++ b/jobs/cache_maintenance/poetry.lock @@ -557,14 +557,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -589,18 +587,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -612,6 +616,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1076,7 +1097,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/jobs/mongodb_migration/poetry.lock b/jobs/mongodb_migration/poetry.lock index 4f90060947..fbbfc48a63 100644 --- a/jobs/mongodb_migration/poetry.lock +++ b/jobs/mongodb_migration/poetry.lock @@ -557,14 +557,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -589,18 +587,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -612,6 +616,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1076,7 +1097,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/libs/libapi/poetry.lock b/libs/libapi/poetry.lock index cebe1b5bd8..677d23f170 100644 --- a/libs/libapi/poetry.lock +++ b/libs/libapi/poetry.lock @@ -564,14 +564,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -596,18 +594,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -619,6 +623,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1144,7 +1165,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/libs/libapi/src/libapi/rows_utils.py b/libs/libapi/src/libapi/rows_utils.py index 9741b9c86c..52e07ed356 100644 --- a/libs/libapi/src/libapi/rows_utils.py +++ b/libs/libapi/src/libapi/rows_utils.py @@ -21,6 +21,7 @@ def _transform_row( split: str, features: Features, storage_client: StorageClient, + hf_endpoint: str, offset: int, row_idx_column: Optional[str], ) -> Row: @@ -36,6 +37,7 @@ def _transform_row( featureName=featureName, fieldType=fieldType, storage_client=storage_client, + hf_endpoint=hf_endpoint, ) for (featureName, fieldType) in features.items() } @@ -52,6 +54,7 @@ async def transform_rows( rows: list[Row], features: Features, storage_client: StorageClient, + hf_endpoint: str, offset: int, row_idx_column: Optional[str], ) -> list[Row]: @@ -63,10 +66,11 @@ async def transform_rows( split=split, features=features, storage_client=storage_client, + hf_endpoint=hf_endpoint, offset=offset, row_idx_column=row_idx_column, ) - if "Audio(" in str(features) or "Image(" in str(features): + if "Audio(" in str(features) or "Image(" in str(features) or "Video(" in str(features): # Use multithreading to parallelize image/audio files uploads. # Also multithreading is ok to convert audio data # (we use pydub which might spawn one ffmpeg process per conversion, which releases the GIL) diff --git a/libs/libapi/src/libapi/utils.py b/libs/libapi/src/libapi/utils.py index 17fe6068c4..7fa2b601e3 100644 --- a/libs/libapi/src/libapi/utils.py +++ b/libs/libapi/src/libapi/utils.py @@ -207,6 +207,7 @@ async def to_rows_list( features: Features, unsupported_columns: list[str], storage_client: StorageClient, + hf_endpoint: str, row_idx_column: Optional[str] = None, truncated_columns: Optional[list[str]] = None, ) -> list[RowItem]: @@ -224,6 +225,7 @@ async def to_rows_list( rows=pa_table.to_pylist(), features=features, storage_client=storage_client, + hf_endpoint=hf_endpoint, offset=offset, row_idx_column=row_idx_column, ) diff --git a/libs/libcommon/poetry.lock b/libs/libcommon/poetry.lock index 69fa679ebb..ae19d0907e 100644 --- a/libs/libcommon/poetry.lock +++ b/libs/libcommon/poetry.lock @@ -593,14 +593,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -625,18 +623,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -648,6 +652,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -3938,4 +3959,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.9.18" -content-hash = "fff930d529a2b90abb4f360acb225964e8bf562640b8ecfe77ec8c89856e4562" +content-hash = "a314d363151d99a193ebad59fc2b73fcf5b6a85bf5e1724303f8e2539efd1248" diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml index 3eba95c06e..771dae9bd4 100644 --- a/libs/libcommon/pyproject.toml +++ b/libs/libcommon/pyproject.toml @@ -9,7 +9,8 @@ license = "Apache-2.0" python = "3.9.18" appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/libs/libcommon/src/libcommon/constants.py b/libs/libcommon/src/libcommon/constants.py index 4af3a0607f..e12187ac75 100644 --- a/libs/libcommon/src/libcommon/constants.py +++ b/libs/libcommon/src/libcommon/constants.py @@ -33,6 +33,7 @@ PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = 100 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = 100 +PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = 10 PARQUET_REVISION = "refs/convert/parquet" TAG_NFAA_CONTENT = "not-for-all-audiences" diff --git a/libs/libcommon/src/libcommon/viewer_utils/asset.py b/libs/libcommon/src/libcommon/viewer_utils/asset.py index 6a6f4fb5b0..c9004b9bf1 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/asset.py +++ b/libs/libcommon/src/libcommon/viewer_utils/asset.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2022 The HuggingFace Authors. +import re from io import BytesIO from tempfile import NamedTemporaryFile -from typing import TYPE_CHECKING, Optional, TypedDict +from typing import TYPE_CHECKING, Any, Optional, TypedDict from PIL import Image, ImageOps from pydub import AudioSegment # type:ignore @@ -28,6 +29,10 @@ class AudioSource(TypedDict): type: str +class VideoSource(TypedDict): + src: str + + def create_image_file( dataset: str, revision: str, @@ -125,3 +130,47 @@ def replace_dataset_git_revision_placeholder(url_or_object_key: str, revision: s # Before: https://datasets-server.huggingface.co/assets/vidore/syntheticDocQA_artificial_intelligence_test/--/{dataset_git_revision}/--/default/test/0/image/image.jpg # After: https://datasets-server.huggingface.co/assets/vidore/syntheticDocQA_artificial_intelligence_test/--/c844916c2920d2d01e8a15f8dc1caf6f017a293c/--/default/test/0/image/image.jpg return url_or_object_key.replace(DATASET_GIT_REVISION_PLACEHOLDER, revision) + + +def create_video_file( + dataset: str, + revision: str, + config: str, + split: str, + row_idx: int, + column: str, + filename: str, + encoded_video: dict[str, Any], + storage_client: "StorageClient", + hf_endpoint: str, +) -> VideoSource: + # We use a placeholder revision in the JSON stored in the database, + # while the path of the file stored on the disk/s3 contains the revision. + # The placeholder will be replaced later by the + # dataset_git_revision of cache responses when the data will be accessed. + # This is useful to allow moving files to a newer revision without having + # to modify the cached rows content. + if ( + "path" in encoded_video + and isinstance(encoded_video["path"], str) + and encoded_video["path"].startswith(f"hf://datasets/{dataset}@") + ): + src = re.sub("@\w+", "/resolve/" + DATASET_GIT_REVISION_PLACEHOLDER, encoded_video["path"]).replace( + "hf://", hf_endpoint + ) + else: + object_key = storage_client.generate_object_key( + dataset=dataset, + revision=DATASET_GIT_REVISION_PLACEHOLDER, + config=config, + split=split, + row_idx=row_idx, + column=column, + filename=filename, + ) + path = replace_dataset_git_revision_placeholder(object_key, revision=revision) + if storage_client.overwrite or not storage_client.exists(path): + with storage_client._fs.open(storage_client.get_full_path(path), "wb") as f: + f.write(encoded_video["bytes"]) + src = storage_client.get_url(object_key, revision=revision) + return VideoSource(src=src) diff --git a/libs/libcommon/src/libcommon/viewer_utils/features.py b/libs/libcommon/src/libcommon/viewer_utils/features.py index 47ad9fa736..6cefba6943 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/features.py +++ b/libs/libcommon/src/libcommon/viewer_utils/features.py @@ -22,13 +22,20 @@ Translation, TranslationVariableLanguages, Value, + Video, ) from datasets.features.features import FeatureType, _visit +from decord import VideoReader # type: ignore from PIL import Image as PILImage from libcommon.dtos import FeatureItem from libcommon.storage_client import StorageClient -from libcommon.viewer_utils.asset import SUPPORTED_AUDIO_EXTENSIONS, create_audio_file, create_image_file +from libcommon.viewer_utils.asset import ( + SUPPORTED_AUDIO_EXTENSIONS, + create_audio_file, + create_image_file, + create_video_file, +) UNSUPPORTED_FEATURES = [Value("binary")] AUDIO_FILE_MAGIC_NUMBERS: dict[str, Any] = { @@ -196,6 +203,64 @@ def infer_audio_file_extension(audio_file_bytes: bytes) -> Optional[str]: return None +def video( + dataset: str, + revision: str, + config: str, + split: str, + row_idx: int, + value: Any, + featureName: str, + storage_client: StorageClient, + hf_endpoint: str, + json_path: Optional[list[Union[str, int]]] = None, +) -> Any: + if value is None: + return None + if isinstance(value, VideoReader) and hasattr(value, "_hf_encoded") and isinstance(value._hf_encoded, dict): + value = value._hf_encoded # `datasets` patches `decord` to store the encoded data here + elif isinstance(value, dict): + value = {"path": value.get("path"), "bytes": value["bytes"]} + elif isinstance(value, bytes): + value = {"path": None, "bytes": value} + elif isinstance(value, str): + value = {"path": value, "bytes": None} + + if not (isinstance(value, dict) and "path" in value and "bytes" in value): + raise TypeError( + "Video cell must be an encoded dict of a video, " + f"but got {str(value)[:300]}{'...' if len(str(value)) > 300 else ''}" + ) + + video_file_extension = get_video_file_extension(value) + return create_video_file( + dataset=dataset, + revision=revision, + config=config, + split=split, + row_idx=row_idx, + column=featureName, + filename=f"{append_hash_suffix('video', json_path)}{video_file_extension}", + encoded_video=value, + storage_client=storage_client, + hf_endpoint=hf_endpoint, + ) + + +def get_video_file_extension(value: Any) -> str: + if "path" in value and isinstance(value["path"], str): + # .split("::")[0] for chained URLs like zip://audio.wav::https://foo.bar/data.zip + video_file_extension = os.path.splitext(value["path"].split("::")[0])[1] + if not video_file_extension: + raise ValueError( + "A video sample should have a 'path' with a valid file name nd extension, but got" + f" {', '.join(value['path'])}." + ) + else: + raise ValueError("A video sample should have 'path' and 'bytes' but got" f" {', '.join(value)}.") + return video_file_extension + + def get_cell_value( dataset: str, revision: str, @@ -206,6 +271,7 @@ def get_cell_value( featureName: str, fieldType: Any, storage_client: StorageClient, + hf_endpoint: str, json_path: Optional[list[Union[str, int]]] = None, ) -> Any: # always allow None values in the cells @@ -235,6 +301,19 @@ def get_cell_value( storage_client=storage_client, json_path=json_path, ) + elif isinstance(fieldType, Video): + return video( + dataset=dataset, + revision=revision, + config=config, + split=split, + row_idx=row_idx, + value=cell, + featureName=featureName, + storage_client=storage_client, + hf_endpoint=hf_endpoint, + json_path=json_path, + ) elif isinstance(fieldType, list): if not isinstance(cell, list): raise TypeError("list cell must be a list.") @@ -252,6 +331,7 @@ def get_cell_value( featureName=featureName, fieldType=subFieldType, storage_client=storage_client, + hf_endpoint=hf_endpoint, json_path=json_path + [idx] if json_path else [idx], ) for (idx, subCell) in enumerate(cell) @@ -271,6 +351,7 @@ def get_cell_value( featureName=featureName, fieldType=fieldType.feature, storage_client=storage_client, + hf_endpoint=hf_endpoint, json_path=json_path + [idx] if json_path else [idx], ) for (idx, subCell) in enumerate(cell) @@ -293,6 +374,7 @@ def get_cell_value( featureName=featureName, fieldType=fieldType.feature[key], storage_client=storage_client, + hf_endpoint=hf_endpoint, json_path=json_path + [key, idx] if json_path else [key, idx], ) for (idx, subCellItem) in enumerate(subCell) @@ -315,6 +397,7 @@ def get_cell_value( featureName=featureName, fieldType=fieldType[key], storage_client=storage_client, + hf_endpoint=hf_endpoint, json_path=json_path + [key] if json_path else [key], ) for (key, subCell) in cell.items() diff --git a/libs/libcommon/src/libcommon/viewer_utils/rows.py b/libs/libcommon/src/libcommon/viewer_utils/rows.py index 2c09d1ef37..4509c442d4 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/rows.py +++ b/libs/libcommon/src/libcommon/viewer_utils/rows.py @@ -28,6 +28,7 @@ def transform_rows( rows: list[Row], features: Features, storage_client: StorageClient, + hf_endpoint: str, ) -> list[Row]: return [ { @@ -41,6 +42,7 @@ def transform_rows( featureName=featureName, fieldType=fieldType, storage_client=storage_client, + hf_endpoint=hf_endpoint, ) for (featureName, fieldType) in features.items() } @@ -58,6 +60,7 @@ def create_first_rows_response( config: str, split: str, storage_client: StorageClient, + hf_endpoint: str, features: Features, get_rows_content: GetRowsContent, min_cell_bytes: int, @@ -149,6 +152,7 @@ def create_first_rows_response( rows=rows_content.rows, features=features, storage_client=storage_client, + hf_endpoint=hf_endpoint, ) except Exception as err: raise RowsPostProcessingError( diff --git a/libs/libcommon/tests/test_url_preparator.py b/libs/libcommon/tests/test_url_preparator.py index 46493498d5..29ad4425f2 100644 --- a/libs/libcommon/tests/test_url_preparator.py +++ b/libs/libcommon/tests/test_url_preparator.py @@ -91,6 +91,7 @@ def get_fake_rows_content(rows_max_number: int) -> RowsContent: # noqa: ARG001 config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client, + hf_endpoint="", features=dataset.features, get_rows_content=get_fake_rows_content, min_cell_bytes=0, @@ -120,6 +121,7 @@ def test_prepare_urls_in_first_rows_in_place( config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client, + hf_endpoint="", features=dataset.features, get_rows_content=get_dataset_rows_content(dataset=dataset), min_cell_bytes=0, @@ -175,6 +177,7 @@ def test_prepare_urls_in_first_rows_in_place_with_truncated_cells( config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client, + hf_endpoint="", features=dataset.features, get_rows_content=get_dataset_rows_content(dataset=dataset), min_cell_bytes=DEFAULT_MIN_CELL_BYTES, diff --git a/libs/libcommon/tests/viewer_utils/test_features.py b/libs/libcommon/tests/viewer_utils/test_features.py index ab5d3a8a85..09e16c7b73 100644 --- a/libs/libcommon/tests/viewer_utils/test_features.py +++ b/libs/libcommon/tests/viewer_utils/test_features.py @@ -78,6 +78,7 @@ def test_get_cell_value_value( featureName=DEFAULT_COLUMN_NAME, fieldType=feature, storage_client=storage_client_with_url_preparator, + hf_endpoint="", ) assert value == expected_cell assert_output_has_valid_files(expected_cell, storage_client=storage_client_with_url_preparator) @@ -184,6 +185,7 @@ async def convert_to_response_dict(http_response, operation_model): # type: ign featureName=DEFAULT_COLUMN_NAME, fieldType=feature, storage_client=storage_client, + hf_endpoint="", ) assert value == dataset_fixture.expected_cell diff --git a/libs/libcommon/tests/viewer_utils/test_rows.py b/libs/libcommon/tests/viewer_utils/test_rows.py index 105b73e80a..1162cc760e 100644 --- a/libs/libcommon/tests/viewer_utils/test_rows.py +++ b/libs/libcommon/tests/viewer_utils/test_rows.py @@ -45,6 +45,7 @@ def test_create_first_rows_response( config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client_with_url_preparator, + hf_endpoint="", features=dataset.features, get_rows_content=get_dataset_rows_content(dataset=dataset), min_cell_bytes=DEFAULT_MIN_CELL_BYTES, @@ -90,6 +91,7 @@ def test_create_first_rows_response_truncated( config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client, + hf_endpoint="", features=dataset.features, get_rows_content=get_dataset_rows_content(dataset=dataset), min_cell_bytes=DEFAULT_MIN_CELL_BYTES, @@ -154,6 +156,7 @@ def test_create_first_rows_response_truncation_on_audio_or_image( config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client_with_url_preparator, + hf_endpoint="", features=dataset.features, get_rows_content=get_dataset_rows_content(dataset=dataset), min_cell_bytes=DEFAULT_MIN_CELL_BYTES, @@ -170,6 +173,7 @@ def test_create_first_rows_response_truncation_on_audio_or_image( config=DEFAULT_CONFIG, split=DEFAULT_SPLIT, storage_client=storage_client_with_url_preparator, + hf_endpoint="", features=dataset.features, get_rows_content=get_dataset_rows_content(dataset=dataset), min_cell_bytes=DEFAULT_MIN_CELL_BYTES, diff --git a/services/admin/poetry.lock b/services/admin/poetry.lock index 94548f503f..6b5299ba4f 100644 --- a/services/admin/poetry.lock +++ b/services/admin/poetry.lock @@ -571,14 +571,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -603,18 +601,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -626,6 +630,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1167,7 +1188,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/api/poetry.lock b/services/api/poetry.lock index 848d07e681..cf5e7ae71f 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -571,14 +571,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -603,18 +601,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -626,6 +630,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1186,7 +1207,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/rows/poetry.lock b/services/rows/poetry.lock index 060be8c996..c6e5cf2094 100644 --- a/services/rows/poetry.lock +++ b/services/rows/poetry.lock @@ -590,14 +590,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -622,18 +620,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -645,6 +649,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1205,7 +1226,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/search/poetry.lock b/services/search/poetry.lock index 4bd2b3aa76..db6bdaecfa 100644 --- a/services/search/poetry.lock +++ b/services/search/poetry.lock @@ -571,14 +571,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -603,18 +601,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -626,6 +630,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1245,7 +1266,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/search/src/search/routes/search.py b/services/search/src/search/routes/search.py index 9d4f48dbef..6feedff53a 100644 --- a/services/search/src/search/routes/search.py +++ b/services/search/src/search/routes/search.py @@ -83,6 +83,7 @@ async def create_response( config: str, split: str, storage_client: StorageClient, + hf_endpoint: str, offset: int, features: Features, unsupported_columns: list[str], @@ -103,6 +104,7 @@ async def create_response( config=config, split=split, storage_client=storage_client, + hf_endpoint=hf_endpoint, offset=offset, features=features, unsupported_columns=unsupported_columns, @@ -234,6 +236,7 @@ async def search_endpoint(request: Request) -> Response: config=config, split=split, storage_client=cached_assets_storage_client, + hf_endpoint=hf_endpoint, offset=offset, features=features or Features.from_arrow_schema(pa_table.schema), unsupported_columns=unsupported_columns, diff --git a/services/sse-api/poetry.lock b/services/sse-api/poetry.lock index f87c24cca3..af08d47195 100644 --- a/services/sse-api/poetry.lock +++ b/services/sse-api/poetry.lock @@ -571,14 +571,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -603,18 +601,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -626,6 +630,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1215,7 +1236,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/webhook/poetry.lock b/services/webhook/poetry.lock index 72ca180b17..37859c89ba 100644 --- a/services/webhook/poetry.lock +++ b/services/webhook/poetry.lock @@ -571,14 +571,12 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -603,18 +601,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -626,6 +630,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1186,7 +1207,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index 2c756b05d5..82d8d5e34e 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -879,14 +879,12 @@ files = [ [[package]] name = "datasets" -version = "3.0.2" +version = "3.0.3.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" -files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, -] +files = [] +develop = false [package.dependencies] aiohttp = "*" @@ -911,18 +909,24 @@ xxhash = "*" [package.extras] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets.git" +reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" +resolved_reference = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9" + [[package]] name = "decorator" version = "5.1.1" @@ -934,6 +938,23 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "decord" +version = "0.6.0" +description = "Decord Video Loader" +optional = false +python-versions = "*" +files = [ + {file = "decord-0.6.0-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323"}, + {file = "decord-0.6.0-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a"}, + {file = "decord-0.6.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8"}, + {file = "decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976"}, + {file = "decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad"}, +] + +[package.dependencies] +numpy = ">=1.14.0" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1646,7 +1667,8 @@ develop = true [package.dependencies] appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = {version = "3.0.2", extras = ["audio", "vision"]} +datasets = {git = "https://github.com/huggingface/datasets.git", rev = "65f6eb54aa0e8bb44cea35deea28e0e8fecc25b9", extras = ["audio", "vision"]} +decord = "0.6.0" environs = "^9.5.0" fsspec = {version = "2024.3.1", extras = ["s3"]} huggingface-hub = {version = "^0.25.1", extras = ["hf-transfer"]} diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 3a57cc850a..f7ebe29296 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -55,6 +55,7 @@ PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS, PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_BINARY_DATASETS, PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS, + PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS, ) from libcommon.dtos import JobInfo, SplitHubFile from libcommon.exceptions import ( @@ -352,7 +353,9 @@ def get_writer_batch_size_from_info(ds_config_info: datasets.info.DatasetInfo) - Writer batch size to pass to a dataset builder. If `None`, then it will use the `datasets` default. """ - if ds_config_info.builder_name == "audiofolder" or "Audio(" in str(ds_config_info.features): + if ds_config_info.builder_name == "videofolder" or "Video(" in str(ds_config_info.features): + return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS + elif ds_config_info.builder_name == "audiofolder" or "Audio(" in str(ds_config_info.features): return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS elif ds_config_info.builder_name == "imagefolder" or "Image(" in str(ds_config_info.features): return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS @@ -363,7 +366,7 @@ def get_writer_batch_size_from_info(ds_config_info: datasets.info.DatasetInfo) - def get_writer_batch_size_from_row_group_size( - num_rows: int, row_group_byte_size: int, max_row_group_byte_size: int, factor_of: int = 100, divide_step: int = 10 + num_rows: int, row_group_byte_size: int, max_row_group_byte_size: int, factor_of: int, divide_step: int ) -> int: """ Get the writer_batch_size that defines the maximum row group size in the parquet files, @@ -1367,7 +1370,17 @@ def compute_config_parquet_and_info_response( num_rows=err.num_rows, row_group_byte_size=err.row_group_byte_size, max_row_group_byte_size=max_row_group_byte_size_for_copy, + factor_of=100, + divide_step=10, ) + if writer_batch_size / err.num_rows * err.row_group_byte_size > max_row_group_byte_size_for_copy: + return get_writer_batch_size_from_row_group_size( + num_rows=err.num_rows, + row_group_byte_size=err.row_group_byte_size, + max_row_group_byte_size=max_row_group_byte_size_for_copy, + factor_of=10, + divide_step=2, + ) parquet_operations, partial, estimated_dataset_info = stream_convert_to_parquet( builder, max_dataset_size_bytes=max_dataset_size_bytes, diff --git a/services/worker/src/worker/job_runners/split/first_rows.py b/services/worker/src/worker/job_runners/split/first_rows.py index d6bf430dae..b8f3e2115d 100644 --- a/services/worker/src/worker/job_runners/split/first_rows.py +++ b/services/worker/src/worker/job_runners/split/first_rows.py @@ -37,6 +37,7 @@ def compute_first_rows_from_parquet_response( config: str, split: str, storage_client: StorageClient, + hf_endpoint: str, min_cell_bytes: int, rows_max_bytes: int, rows_max_number: int, @@ -124,6 +125,7 @@ def get_rows_content(rows_max_number: int) -> RowsContent: config=config, split=split, storage_client=storage_client, + hf_endpoint=hf_endpoint, features=features, get_rows_content=get_rows_content, min_cell_bytes=min_cell_bytes, @@ -140,6 +142,7 @@ def compute_first_rows_from_streaming_response( config: str, split: str, storage_client: StorageClient, + hf_endpoint: str, hf_token: Optional[str], min_cell_bytes: int, rows_max_bytes: int, @@ -265,6 +268,7 @@ def get_rows_content(rows_max_number: int) -> RowsContent: config=config, split=split, storage_client=storage_client, + hf_endpoint=hf_endpoint, features=features, get_rows_content=get_rows_content, min_cell_bytes=min_cell_bytes, @@ -317,6 +321,7 @@ def compute(self) -> CompleteJobResult: config=self.config, split=self.split, storage_client=self.storage_client, + hf_endpoint=self.app_config.common.hf_endpoint, min_cell_bytes=self.first_rows_config.min_cell_bytes, rows_max_bytes=self.first_rows_config.max_bytes, rows_min_number=self.first_rows_config.min_number, @@ -343,6 +348,7 @@ def compute(self) -> CompleteJobResult: config=self.config, split=self.split, storage_client=self.storage_client, + hf_endpoint=self.app_config.common.hf_endpoint, hf_token=self.app_config.common.hf_token, min_cell_bytes=self.first_rows_config.min_cell_bytes, rows_max_bytes=self.first_rows_config.max_bytes,