Skip to content

Commit

Permalink
Support JSON lines with missing struct fields (#7160)
Browse files: browse the repository at this point in the history
* Test cast_array_to_features with struct with missing fields

* Support cast_array_to_features with struct with missing fields
  • Loading branch information
albertvillanova authored Sep 23, 2024
1 parent 13f18e3 commit 2eb4edb
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/datasets/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2000,10 +2000,14 @@ def cast_array_to_feature(
sequence_kwargs = vars(feature).copy()
feature = sequence_kwargs.pop("feature")
feature = {name: Sequence(subfeature, **sequence_kwargs) for name, subfeature in feature.items()}
if isinstance(feature, dict) and {field.name for field in array.type} == set(feature):
if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature):
if array.type.num_fields == 0:
return array
arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()]
null_array = pa.array([None] * len(array))
arrays = [
_c(array.field(name) if name in array_fields else null_array, subfeature)
for name, subfeature in feature.items()
]
return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
# feature must be either [subfeature] or LargeList(subfeature) or Sequence(subfeature)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,6 +1142,14 @@ def test_cast_decimal_array_to_features():
cast_array_to_feature(arr, Sequence(Value("string")), allow_decimal_to_str=False)


def test_cast_array_to_features_with_struct_with_missing_fields():
    """Check that a struct field absent from the array is filled with nulls on cast."""
    # The source rows only carry "age"; the target feature also declares "name".
    target_feature = {"age": Value("int32"), "name": Value("string")}
    source = pa.array([{"age": 25}, {"age": 63}])

    result = cast_array_to_feature(source, target_feature)

    expected_type = pa.struct({"age": pa.int32(), "name": pa.string()})
    expected_rows = [{"age": 25, "name": None}, {"age": 63, "name": None}]
    assert result.type == expected_type
    assert result.to_pylist() == expected_rows


def test_cast_array_to_features_nested():
arr = pa.array([[{"foo": [0]}]])
assert cast_array_to_feature(arr, [{"foo": Sequence(Value("string"))}]).type == pa.list_(
Expand Down

0 comments on commit 2eb4edb

Please sign in to comment.