Skip to content

Commit

Permalink
add to webdataset
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Oct 16, 2024
1 parent 512a9a4 commit a57f51b
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions src/datasets/packaged_modules/webdataset/webdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class WebDataset(datasets.GeneratorBasedBuilder):
DEFAULT_WRITER_BATCH_SIZE = 100
IMAGE_EXTENSIONS: List[str] # definition at the bottom of the script
AUDIO_EXTENSIONS: List[str] # definition at the bottom of the script
VIDEO_EXTENSIONS: List[str] # definition at the bottom of the script
DECODERS: Dict[str, Callable[[Any], Any]] # definition at the bottom of the script
NUM_EXAMPLES_FOR_FEATURES_INFERENCE = 5

Expand Down Expand Up @@ -97,6 +98,11 @@ def _split_generators(self, dl_manager):
extension = field_name.rsplit(".", 1)[-1]
if extension in self.AUDIO_EXTENSIONS:
features[field_name] = datasets.Audio()
# Set Video types
for field_name in first_examples[0]:
extension = field_name.rsplit(".", 1)[-1]
if extension in self.VIDEO_EXTENSIONS:
features[field_name] = datasets.Video()
self.info.features = features

return splits
Expand Down Expand Up @@ -259,6 +265,17 @@ def base_plus_ext(path):
WebDataset.AUDIO_EXTENSIONS = AUDIO_EXTENSIONS


# TODO: initial list, we should check the compatibility of other formats
# Entries are stored WITHOUT a leading dot: the feature-inference code compares
# them against `field_name.rsplit(".", 1)[-1]`, which yields e.g. "mp4", never
# ".mp4". With dotted entries the membership test could never succeed and no
# field would ever be typed as a video.
VIDEO_EXTENSIONS = [
    "mkv",
    "mp4",
    "avi",
    "mpeg",
    "mov",
]
# Bind the list onto the builder class; the class body only declares the
# attribute's type and defers the actual value to the bottom of the script.
WebDataset.VIDEO_EXTENSIONS = VIDEO_EXTENSIONS


def text_loads(data: bytes):
    """Decode a raw UTF-8 byte payload into a Python string.

    Args:
        data: The encoded bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If ``data`` is not valid UTF-8.
    """
    decoded_text = data.decode(encoding="utf-8")
    return decoded_text

Expand Down

0 comments on commit a57f51b

Please sign in to comment.