From 28d46ce889e26032f8a78a495dfee35109a59fd4 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 13 Aug 2024 13:41:34 -0400 Subject: [PATCH] feat: Support for Arrow PyCapsule interface (#23) --- src/quak/__init__.py | 7 ++++++- src/quak/_util.py | 10 ++++++++++ src/quak/_widget.py | 19 +++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/quak/__init__.py b/src/quak/__init__.py index bfccfcc..8879390 100644 --- a/src/quak/__init__.py +++ b/src/quak/__init__.py @@ -1,5 +1,6 @@ """An anywidget for data that talks like a duck.""" +from ._util import has_pycapsule_stream_interface from ._version import __version__ from ._widget import Widget @@ -18,7 +19,11 @@ def format(self, obj, include=None, exclude=None): # special case for duckdb relations if isinstance(obj, duckdb.DuckDBPyRelation): obj = obj.arrow() - if is_arrow_ipc(obj) or is_dataframe_api_obj(obj): + if ( + has_pycapsule_stream_interface(obj) + or is_arrow_ipc(obj) + or is_dataframe_api_obj(obj) + ): obj = Widget(obj) return super().format(obj, include, exclude) diff --git a/src/quak/_util.py b/src/quak/_util.py index b04b54d..94aa9fe 100644 --- a/src/quak/_util.py +++ b/src/quak/_util.py @@ -9,6 +9,16 @@ import pyarrow as pa +def has_pycapsule_stream_interface(obj: object) -> bool: + """ + Check if an object implements the Arrow C Stream Arrow via the PyCapsule Interface. + + https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + has an Arrow C Stream + """ + return hasattr(obj, "__arrow_c_stream__") + + def is_dataframe_api_obj(obj: object) -> DataFrameObject: """Check if an object has a dataframe API.""" method = getattr(obj, "__dataframe__", None) diff --git a/src/quak/_widget.py b/src/quak/_widget.py index 91bc120..808a8f5 100644 --- a/src/quak/_widget.py +++ b/src/quak/_widget.py @@ -6,13 +6,16 @@ import anywidget import duckdb +import pyarrow as pa import traitlets from ._util import ( arrow_table_from_dataframe_protocol, arrow_table_from_ipc, get_columns, + has_pycapsule_stream_interface, is_arrow_ipc, + is_dataframe_api_obj, table_to_ipc, ) @@ -37,10 +40,22 @@ def __init__(self, data, *, table: str = "df"): conn = data else: conn = duckdb.connect(":memory:") - if is_arrow_ipc(data): + if has_pycapsule_stream_interface(data): + # NOTE: for now we materialize the input into an in-memory Arrow table, + # so that we can perform repeated queries on that. In the future, it may + # be better to keep this Arrow stream non-materalized in Python and + # create a new DuckDB table from the stream. + # arrow_table = pa.RecordBatchReader.from_stream(data) + arrow_table = pa.table(data) + elif is_arrow_ipc(data): arrow_table = arrow_table_from_ipc(data) - else: + elif is_dataframe_api_obj(data): arrow_table = arrow_table_from_dataframe_protocol(data) + else: + raise ValueError( + "input must be a DuckDB connection, DataFrame-like, an Arrow IPC " + "table, or an Arrow object exporting the Arrow C Stream interface." + ) conn.register(table, arrow_table) self._conn = conn super().__init__(