Skip to content

Commit

Permalink
Don't re-encode features
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-hh committed Oct 10, 2024
1 parent b828575 commit f76701b
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2480,7 +2480,8 @@ def map(
ex_iterable = FormattedExamplesIterable(
ex_iterable,
formatting=copy.deepcopy(self._formatting),
features=features,
# only extract features if not already extracted by ex_iterable
features=None if ex_iterable.is_typed else self._info.features,
token_per_repo_id=self._token_per_repo_id,
)
ex_iterable = RebatchedArrowExamplesIterable(
Expand All @@ -2491,12 +2492,13 @@ def map(
ex_iterable = RebatchedArrowExamplesIterable(
self._ex_iterable, batch_size=batch_size if batched else 1, drop_last_batch=drop_last_batch
)
if self._formatting or features:
if self._formatting:
# apply formatting after iter_arrow to avoid re-encoding the examples
ex_iterable = FormattedExamplesIterable(
ex_iterable,
formatting=copy.deepcopy(self._formatting),
features=features,
# only extract features if not already extracted by ex_iterable
features=None if ex_iterable.is_typed else self._info.features,
token_per_repo_id=self._token_per_repo_id,
)

Expand Down Expand Up @@ -2585,7 +2587,7 @@ def filter(
ex_iterable = FormattedExamplesIterable(
ex_iterable,
formatting=copy.deepcopy(self._formatting),
features=self._info.features,
features=None if ex_iterable.is_typed else self._info.features,
token_per_repo_id=self._token_per_repo_id,
)

Expand Down

0 comments on commit f76701b

Please sign in to comment.