Skip to content

Commit

Permalink
feat: Support for Arrow PyCapsule interface (#23)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebarron authored Aug 13, 2024
1 parent f5220dc commit 28d46ce
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 3 deletions.
7 changes: 6 additions & 1 deletion src/quak/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""An anywidget for data that talks like a duck."""

from ._util import has_pycapsule_stream_interface
from ._version import __version__
from ._widget import Widget

Expand All @@ -18,7 +19,11 @@ def format(self, obj, include=None, exclude=None):
# special case for duckdb relations
if isinstance(obj, duckdb.DuckDBPyRelation):
obj = obj.arrow()
if is_arrow_ipc(obj) or is_dataframe_api_obj(obj):
if (
has_pycapsule_stream_interface(obj)
or is_arrow_ipc(obj)
or is_dataframe_api_obj(obj)
):
obj = Widget(obj)
return super().format(obj, include, exclude)

Expand Down
10 changes: 10 additions & 0 deletions src/quak/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
import pyarrow as pa


def has_pycapsule_stream_interface(obj: object) -> bool:
    """
    Check if an object exports an Arrow C Stream via the Arrow PyCapsule Interface.

    The check is structural only: it reports whether ``obj`` exposes an
    ``__arrow_c_stream__`` attribute; the attribute is looked up, not called.

    https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    """
    # A private sentinel distinguishes "attribute absent" from any real value
    # (including None) the attribute could hold.
    missing = object()
    return getattr(obj, "__arrow_c_stream__", missing) is not missing


def is_dataframe_api_obj(obj: object) -> DataFrameObject:
"""Check if an object has a dataframe API."""
method = getattr(obj, "__dataframe__", None)
Expand Down
19 changes: 17 additions & 2 deletions src/quak/_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@

import anywidget
import duckdb
import pyarrow as pa
import traitlets

from ._util import (
arrow_table_from_dataframe_protocol,
arrow_table_from_ipc,
get_columns,
has_pycapsule_stream_interface,
is_arrow_ipc,
is_dataframe_api_obj,
table_to_ipc,
)

Expand All @@ -37,10 +40,22 @@ def __init__(self, data, *, table: str = "df"):
conn = data
else:
conn = duckdb.connect(":memory:")
if is_arrow_ipc(data):
if has_pycapsule_stream_interface(data):
# NOTE: for now we materialize the input into an in-memory Arrow table,
# so that we can perform repeated queries on that. In the future, it may
# be better to keep this Arrow stream non-materialized in Python and
# create a new DuckDB table from the stream.
# arrow_table = pa.RecordBatchReader.from_stream(data)
arrow_table = pa.table(data)
elif is_arrow_ipc(data):
arrow_table = arrow_table_from_ipc(data)
else:
elif is_dataframe_api_obj(data):
arrow_table = arrow_table_from_dataframe_protocol(data)
else:
raise ValueError(
"input must be a DuckDB connection, DataFrame-like, an Arrow IPC "
"table, or an Arrow object exporting the Arrow C Stream interface."
)
conn.register(table, arrow_table)
self._conn = conn
super().__init__(
Expand Down

0 comments on commit 28d46ce

Please sign in to comment.