From 2410b9ae0a5b2a2778e5e274a8ef05f786faf6de Mon Sep 17 00:00:00 2001
From: CI
Date: Tue, 25 Jun 2024 14:20:04 +0000
Subject: [PATCH] Deployed cf07d13 to 0.5.1 with MkDocs 1.6.0 and mike 2.1.2

---
 0.5.1/api/arrow/index.html      | 101 ++++++++++++++++++++++++++------
 0.5.1/search/search_index.json  |   2 +-
 0.5.1/sitemap.xml               |  16 ++---
 0.5.1/sitemap.xml.gz            | Bin 278 -> 278 bytes
 0.5.1/usage/index.html          |  20 ++++++-
 latest/api/arrow/index.html     | 101 ++++++++++++++++++++++++++------
 latest/search/search_index.json |   2 +-
 latest/sitemap.xml              |  16 ++---
 latest/sitemap.xml.gz           | Bin 278 -> 278 bytes
 latest/usage/index.html         |  20 ++++++-
 10 files changed, 220 insertions(+), 58 deletions(-)

diff --git a/0.5.1/api/arrow/index.html b/0.5.1/api/arrow/index.html
index 17bb1c8..304788a 100644
--- a/0.5.1/api/arrow/index.html
+++ b/0.5.1/api/arrow/index.html
@@ -916,7 +916,7 @@

 *, chunk_size: int = 8192, schema: Schema | InferredSchema | None = None
-) -> Iterable[RecordBatch]
+) -> RecordBatchReader
@@ -968,10 +968,10 @@

Returns:

  •
-    Iterable[RecordBatch]
+    RecordBatchReader

-    an iterable of pyarrow RecordBatches with the STAC-GeoParquet representation of items.
+    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

@@ -994,7 +994,7 @@

 chunk_size: int = DEFAULT_JSON_CHUNK_SIZE, schema: Schema | None = None, limit: int | None = None
-) -> Iterator[RecordBatch]
+) -> RecordBatchReader

@@ -1054,13 +1054,13 @@

-Yields:
+Returns:

  •
-    RecordBatch
+    RecordBatchReader

-    Arrow RecordBatch with a single chunk of Item data.
+    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

@@ -1255,6 +1255,8 @@

pyarrow.parquet.ParquetWriter.

@@ -1268,12 +1270,43 @@

-stac_table_to_items(table: Table) -> Iterable[dict]
+stac_table_to_items(
+    table: Table | RecordBatchReader | ArrowStreamExportable,
+) -> Iterable[dict]
 
-Convert a STAC Table to a generator of STAC Item dicts
+Convert STAC Arrow to a generator of STAC Item dicts.

+
+Parameters:
+
+  • table (Table | RecordBatchReader | ArrowStreamExportable) –
+
+    STAC in Arrow form. This can be a pyarrow Table, a pyarrow
+    RecordBatchReader, or any other Arrow stream object exposed through the
+    Arrow PyCapsule Interface.
+    A RecordBatchReader or stream object will not be materialized in memory.

+Yields:
+
+  • Iterable[dict] –
+
+    A STAC dict for each input row.
@@ -1287,12 +1320,40 @@

-stac_table_to_ndjson(table: Table, dest: str | Path | PathLike[bytes]) -> None
+stac_table_to_ndjson(
+    table: Table | RecordBatchReader | ArrowStreamExportable,
+    dest: str | Path | PathLike[bytes],
+) -> None
 
-Write a STAC Table to a newline-delimited JSON file.
+Write STAC Arrow to a newline-delimited JSON file.

+
+Parameters:
+
+  • table (Table | RecordBatchReader | ArrowStreamExportable) –
+
+    STAC in Arrow form. This can be a pyarrow Table, a pyarrow
+    RecordBatchReader, or any other Arrow stream object exposed through the
+    Arrow PyCapsule Interface.
+    A RecordBatchReader or stream object will not be materialized in memory.
+
+  • dest (str | Path | PathLike[bytes]) –
+
+    The destination where newline-delimited JSON should be written.
@@ -1307,8 +1368,8 @@

to_parquet(
-    table: Table,
-    where: Any,
+    table: Table | RecordBatchReader | ArrowStreamExportable,
+    output_path: str | Path,
     *,
     schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,
     **kwargs: Any
@@ -1325,15 +1386,19 @@ 

  • table
-    (Table) –
+    (Table | RecordBatchReader | ArrowStreamExportable) –

-    The table to write to Parquet
+    STAC in Arrow form. This can be a pyarrow Table, a pyarrow
+    RecordBatchReader, or any other Arrow stream object exposed through the
+    Arrow PyCapsule Interface.
+    A RecordBatchReader or stream object will not be materialized in memory.

  •
-    where (Any) –
+    output_path (str | Path) –

    The destination for saving.

    @@ -1354,6 +1419,8 @@

+
+    All other keyword args are passed on to
+    pyarrow.parquet.ParquetWriter.

diff --git a/0.5.1/search/search_index.json b/0.5.1/search/search_index.json index 582ab72..e02f440 100644 --- a/0.5.1/search/search_index.json +++ b/0.5.1/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"STAC-geoparquet","text":"

Convert STAC items between JSON, GeoParquet, pgstac, and Delta Lake.

"},{"location":"#purpose","title":"Purpose","text":"

The STAC spec defines a JSON-based schema. But it can be hard to manage and search through many millions of STAC items in JSON format. For one, JSON is very large on disk. And you need to parse the entire JSON data into memory to extract just a small piece of information, say the datetime and one asset of an Item.

GeoParquet can be a good complement to JSON for many bulk-access and analytic use cases. While STAC Items are commonly distributed as individual JSON files on object storage or through a STAC API, STAC GeoParquet allows users to access a large number of STAC items in bulk without making repeated HTTP requests.

For analytic questions like \"find the items in the Sentinel-2 collection in June 2024 over New York City with cloud cover of less than 20%\" it can be much, much faster to find the relevant data from a GeoParquet source than from JSON, because GeoParquet needs to load only the relevant columns for that query, not the full data.

See the STAC-GeoParquet specification for details on the exact schema of the written Parquet files.

"},{"location":"#documentation","title":"Documentation","text":"

Documentation website

"},{"location":"drawbacks/","title":"Drawbacks","text":"

Trying to represent STAC data in GeoParquet has some drawbacks.

"},{"location":"drawbacks/#unable-to-represent-undefined-values","title":"Unable to represent undefined values","text":"

Parquet is unable to represent the difference between undefined and null, and so is unable to perfectly round-trip STAC data with undefined values.

In JSON a value can have one of three states: defined, undefined, or null. The \"b\" key in the next three examples illustrates this:

Defined:

{\n  \"a\": 1,\n  \"b\": \"foo\"\n}\n

Undefined:

{\n  \"a\": 2\n}\n

Null:

{\n  \"a\": 3,\n  \"b\": null\n}\n

Because Parquet is a columnar format, it is only able to represent undefined at the column level. So if those three JSON items above were converted to Parquet, the column \"b\" would exist because it exists in the first and third item, and the second item would have \"b\" inferred as null:

a b 1 \"foo\" 2 null 3 null

Then when the second item is converted back to JSON, it will be returned as

{\n  \"a\": 2\n  \"b\": null\n}\n

which is not strictly equal to the input.

"},{"location":"drawbacks/#schema-difficulties","title":"Schema difficulties","text":"

JSON is schemaless while Parquet requires a strict schema, and it can be very difficult to unite these two systems. This is such an important consideration that we have a documentation page just to discuss this point.

"},{"location":"schema/","title":"Schema considerations","text":"

A STAC Item is a JSON object to describe an external geospatial dataset. The STAC specification defines a common core, plus a variety of extensions. Additionally, STAC Items may include custom extensions outside the common ones. Crucially, the majority of the specified fields in the core spec and extensions define optional keys. Those keys often differ across STAC collections and may even differ within a single collection across items.

STAC's flexibility is a blessing and a curse. The flexibility of schemaless JSON allows for very easy writing as each object can be dumped separately to JSON. Every item is allowed to have a different schema. And newer items are free to have a different schema than older items in the same collection. But this write-time flexibility makes it harder to read as there are no guarantees (outside STAC's few required fields) about what fields exist.

Parquet is the complete opposite of JSON. Parquet has a strict schema that must be known before writing can start. This puts the burden of work onto the writer instead of the reader. Reading Parquet is very efficient because the file's metadata defines the exact schema of every record. This also enables use cases like reading specific columns that would not be possible without a strict schema.

This conversion from schemaless to strict-schema is the difficult part of converting STAC from JSON to GeoParquet, especially for large input datasets like STAC that are often larger than memory.

"},{"location":"schema/#full-scan-over-input-data","title":"Full scan over input data","text":"

The most foolproof way to convert STAC JSON to GeoParquet is to perform a full scan over input data. This is done automatically by parse_stac_ndjson_to_arrow when a schema is not provided.

This is time consuming as it requires two full passes over the input data: once to infer a common schema and again to actually write to Parquet (though items are never fully held in memory, allowing this process to scale).

"},{"location":"schema/#user-provided-schema","title":"User-provided schema","text":"

Alternatively, the user can pass in an Arrow schema themselves using the schema parameter of parse_stac_ndjson_to_arrow. This schema must match the on-disk schema of the STAC JSON data.

"},{"location":"schema/#multiple-schemas-per-collection","title":"Multiple schemas per collection","text":"

It is also possible to write multiple Parquet files with STAC data where each Parquet file may have a different schema. This simplifies the conversion and writing process but makes reading and using the Parquet data harder.

"},{"location":"schema/#merging-data-with-schema-mismatch","title":"Merging data with schema mismatch","text":"

If you've created STAC GeoParquet data where the schema has changed over time, you can use pyarrow.concat_tables with promote_options=\"permissive\" to combine multiple STAC GeoParquet files.

import pyarrow as pa\nimport pyarrow.parquet as pq\n\ntable_1 = pq.read_table(\"stac1.parquet\")\ntable_2 = pq.read_table(\"stac2.parquet\")\ncombined_table = pa.concat_tables([table_1, table_2], promote_options=\"permissive\")\n
"},{"location":"schema/#future-work","title":"Future work","text":"

Schema operations are an area where future work can improve the reliability and ease of use of STAC GeoParquet.

It's possible that in the future we could automatically infer an Arrow schema from the STAC specification's published JSON Schema files. If you're interested in this, open an issue and discuss.

"},{"location":"usage/","title":"Usage","text":"

Except for the legacy API, Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

All functionality that goes through Arrow is currently exported via the stac_geoparquet.arrow namespace.

"},{"location":"usage/#dictjson-arrow-conversion","title":"dict/JSON - Arrow conversion","text":""},{"location":"usage/#convert-dicts-to-arrow","title":"Convert dicts to Arrow","text":"

Use parse_stac_items_to_arrow to convert STAC items either in memory or on disk to a stream of Arrow record batches. This accepts either an iterable of Python dicts or an iterable of pystac.Item objects.

"},{"location":"usage/#convert-json-to-arrow","title":"Convert JSON to Arrow","text":"

parse_stac_ndjson_to_arrow is a helper function to take one or more JSON or newline-delimited JSON files on disk, infer the schema from all of them, and convert the data to a stream of Arrow record batches.

"},{"location":"usage/#convert-arrow-to-dicts","title":"Convert Arrow to dicts","text":"

Use stac_table_to_items to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

"},{"location":"usage/#convert-arrow-to-json","title":"Convert Arrow to JSON","text":"

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

"},{"location":"usage/#parquet","title":"Parquet","text":"

Use to_parquet to write STAC Arrow data in memory. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

parse_stac_ndjson_to_parquet is a helper that connects reading (newline-delimited) JSON on disk to writing out to a Parquet file.

No special API is required for reading a STAC GeoParquet file back into Arrow. You can use pyarrow.parquet.read_table or pyarrow.parquet.ParquetFile directly to read the STAC GeoParquet data back into Arrow.

"},{"location":"usage/#delta-lake","title":"Delta Lake","text":"

Use parse_stac_ndjson_to_delta_lake to read (newline-delimited) JSON on disk and write out to a Delta Lake table.

No special API is required for reading a STAC Delta Lake table back into Arrow. You can use the DeltaTable class directly to read the data back into Arrow.

Important

Arrow has a null data type, where every value in the column is always null, but Delta Lake does not. This means that for any column inferred to have a null data type, writing to Delta Lake will error with

_internal.SchemaMismatchError: Invalid data type for Delta Lake: Null\n

This is a problem because if all items in a STAC Collection have a null JSON key, it gets inferred as an Arrow null type. For example, the 3dep-lidar-copc collection in the tests has start_datetime and end_datetime fields, and so according to the spec, datetime is always null. This column would need to be cast to a timestamp type before being written to Delta Lake.

This means we cannot write this collection to Delta Lake solely with automatic schema inference.

In such cases, users may need to manually update the inferred schema to cast any null type to another Delta Lake-compatible type.

"},{"location":"api/arrow/","title":"stac_geoparquet.arrow","text":"

Arrow-based format conversions.

"},{"location":"api/arrow/#stac_geoparquet.arrow","title":"stac_geoparquet.arrow","text":""},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_JSON_CHUNK_SIZE","title":"DEFAULT_JSON_CHUNK_SIZE module-attribute","text":"
DEFAULT_JSON_CHUNK_SIZE = 65536\n

The default chunk size to use for reading JSON into memory.

"},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_PARQUET_SCHEMA_VERSION","title":"DEFAULT_PARQUET_SCHEMA_VERSION module-attribute","text":"
DEFAULT_PARQUET_SCHEMA_VERSION: SUPPORTED_PARQUET_SCHEMA_VERSIONS = '1.1.0'\n

The default GeoParquet schema version written to file.

"},{"location":"api/arrow/#stac_geoparquet.arrow.SUPPORTED_PARQUET_SCHEMA_VERSIONS","title":"SUPPORTED_PARQUET_SCHEMA_VERSIONS module-attribute","text":"
SUPPORTED_PARQUET_SCHEMA_VERSIONS = Literal['1.0.0', '1.1.0']\n

A Literal type with the supported GeoParquet schema versions.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_items_to_arrow","title":"parse_stac_items_to_arrow","text":"
parse_stac_items_to_arrow(\n    items: Iterable[Item | dict[str, Any]],\n    *,\n    chunk_size: int = 8192,\n    schema: Schema | InferredSchema | None = None\n) -> Iterable[RecordBatch]\n

Parse a collection of STAC Items to an iterable of pyarrow.RecordBatch.

The objects under properties are moved up to the top-level of the Table, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Iterable[Item | dict[str, Any]]) \u2013

    the STAC Items to convert

  • chunk_size (int, default: 8192 ) \u2013

    The chunk size to use for Arrow record batches. This only takes effect if schema is not None. When schema is None, the input will be parsed into a single contiguous record batch. Defaults to 8192.

  • schema (Schema | InferredSchema | None, default: None ) \u2013

    The schema of the input data. If provided, can improve memory use; otherwise all items need to be parsed into a single array for schema inference. Defaults to None.

Returns:

  • Iterable[RecordBatch] \u2013

    an iterable of pyarrow RecordBatches with the STAC-GeoParquet representation of items.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow","title":"parse_stac_ndjson_to_arrow","text":"
parse_stac_ndjson_to_arrow(\n    path: str | Path | Iterable[str | Path],\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None\n) -> Iterator[RecordBatch]\n

Convert one or more newline-delimited JSON STAC files to a generator of Arrow RecordBatches.

Each RecordBatch in the returned iterator is guaranteed to have an identical schema, and can be used to write to one or more Parquet files.

Parameters:

  • path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data.

Other Parameters:

  • limit (int | None) \u2013

    The maximum number of JSON Items to use for schema inference

Yields:

  • RecordBatch \u2013

    Arrow RecordBatch with a single chunk of Item data.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_delta_lake","title":"parse_stac_ndjson_to_delta_lake","text":"
parse_stac_ndjson_to_delta_lake(\n    input_path: str | Path | Iterable[str | Path],\n    table_or_uri: str | Path | DeltaTable,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to Delta Lake

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • table_or_uri (str | Path | DeltaTable) \u2013

    A path to the output Delta Lake table

Parameters:

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size to use for reading JSON into memory. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None, default: None ) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS, default: DEFAULT_PARQUET_SCHEMA_VERSION ) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_parquet","title":"parse_stac_ndjson_to_parquet","text":"
parse_stac_ndjson_to_parquet(\n    input_path: str | Path | Iterable[str | Path],\n    output_path: str | Path,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | InferredSchema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to GeoParquet

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • output_path (str | Path) \u2013

    A path to the output Parquet file.

Other Parameters:

  • chunk_size (int) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | InferredSchema | None) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_items","title":"stac_table_to_items","text":"
stac_table_to_items(table: Table) -> Iterable[dict]\n

Convert a STAC Table to a generator of STAC Item dicts

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_ndjson","title":"stac_table_to_ndjson","text":"
stac_table_to_ndjson(table: Table, dest: str | Path | PathLike[bytes]) -> None\n

Write a STAC Table to a newline-delimited JSON file.

"},{"location":"api/arrow/#stac_geoparquet.arrow.to_parquet","title":"to_parquet","text":"
to_parquet(\n    table: Table,\n    where: Any,\n    *,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Write an Arrow table with STAC data to GeoParquet

This writes metadata compliant with either GeoParquet 1.0 or 1.1.

Parameters:

  • table (Table) \u2013

    The table to write to Parquet

  • where (Any) \u2013

    The destination for saving.

Other Parameters:

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/legacy/","title":"Direct GeoPandas conversion (Legacy)","text":"

The API listed here was the initial non-Arrow-based STAC-GeoParquet implementation, converting between JSON and GeoPandas directly. For large collections of STAC items, using the new Arrow-based functionality (under the stac_geoparquet.arrow namespace) will be more performant.

Note that stac_geoparquet lifts the keys in the item properties up to the top level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

>>> import requests\n>>> import stac_geoparquet.arrow\n>>> import pyarrow.parquet\n>>> import pyarrow as pa\n\n>>> items = requests.get(\n...     \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items\"\n... ).json()[\"features\"]\n>>> table = pa.Table.from_batches(stac_geoparquet.arrow.parse_stac_items_to_arrow(items))\n>>> stac_geoparquet.arrow.to_parquet(table, \"items.parquet\")\n>>> table2 = pyarrow.parquet.read_table(\"items.parquet\")\n>>> items2 = list(stac_geoparquet.arrow.stac_table_to_items(table2))\n
"},{"location":"api/legacy/#stac_geoparquet.to_geodataframe","title":"stac_geoparquet.to_geodataframe","text":"
to_geodataframe(\n    items: Sequence[dict[str, Any]],\n    add_self_link: bool = False,\n    dtype_backend: DTYPE_BACKEND | None = None,\n    datetime_precision: str = \"ns\",\n) -> GeoDataFrame\n

Convert a sequence of STAC items to a geopandas.GeoDataFrame.

The objects under properties are moved up to the top-level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Sequence[dict[str, Any]]) \u2013

    A sequence of STAC items.

  • add_self_link (bool, default: False ) \u2013

    bool, default False Add the absolute link (if available) to the source STAC Item as a separate column named \"self_link\"

  • dtype_backend (DTYPE_BACKEND | None, default: None ) \u2013

    {'pyarrow', 'numpy_nullable'}, optional The dtype backend to use for storing arrays.

    By default, this will use 'numpy_nullable' and emit a FutureWarning that the default will change to 'pyarrow' in the next release.

    Set to 'numpy_nullable' to silence the warning and accept the old behavior.

    Set to 'pyarrow' to silence the warning and accept the new behavior.

    There are some differences in the output as well: with dtype_backend=\"pyarrow\", struct-like fields will explicitly contain null values for fields that appear in only some of the records. For example, given an assets object like:

    {\n    \"a\": {\n        \"href\": \"a.tif\",\n    },\n    \"b\": {\n        \"href\": \"b.tif\",\n        \"title\": \"B\",\n    }\n}\n

    The assets field of the output for the first row with dtype_backend=\"numpy_nullable\" will be a Python dictionary with just {\"href\": \"a.tif\"}.

    With dtype_backend=\"pyarrow\", this will be a pyarrow struct with fields {\"href\": \"a.tif\", \"title\", None}. pyarrow will infer that the struct field asset.title is nullable.

  • datetime_precision (str, default: 'ns' ) \u2013

    str, default \"ns\" The precision to use for the datetime columns. For example, \"us\" is microsecond and \"ns\" is nanosecond.

Returns:

  • GeoDataFrame \u2013

    The converted GeoDataFrame.
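
For example, a minimal sketch using the same Planetary Computer endpoint as the example above:

import requests\n\nimport stac_geoparquet\n\n# Fetch a page of STAC Items and lift their properties into GeoDataFrame columns.\nitems = requests.get(\n    \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items\"\n).json()[\"features\"]\ndf = stac_geoparquet.to_geodataframe(items, dtype_backend=\"pyarrow\")\n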

"},{"location":"api/legacy/#stac_geoparquet.to_item_collection","title":"stac_geoparquet.to_item_collection","text":"
to_item_collection(df: GeoDataFrame) -> ItemCollection\n

Convert a GeoDataFrame of STAC items to a pystac.ItemCollection.

Parameters:

  • df (GeoDataFrame) \u2013

    A GeoDataFrame with a schema similar to that exported by stac-geoparquet.

Returns:

  • ItemCollection \u2013

    The converted ItemCollection. There will be one record / feature per row in the GeoDataFrame.

"},{"location":"api/legacy/#stac_geoparquet.to_dict","title":"stac_geoparquet.to_dict","text":"
to_dict(record: dict) -> dict\n

Create a dictionary representing a STAC item from a row of the GeoDataFrame.

Parameters:

  • record (dict) \u2013

    dict

"},{"location":"api/pgstac/","title":"pgstac integration","text":"

stac_geoparquet.pgstac_reader has some helpers for working with items coming from a pgstac.items table. It takes care of

  • Rehydrating the dehydrated items
  • Partitioning by time
  • Injecting dynamic links and assets from a STAC API
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig","title":"stac_geoparquet.pgstac_reader.CollectionConfig dataclass","text":"

Additional collection-based configuration to inject, matching the dynamic properties from the API.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection","title":"collection property","text":"
collection: Collection\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection_id","title":"collection_id instance-attribute","text":"
collection_id: str\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.partition_frequency","title":"partition_frequency class-attribute instance-attribute","text":"
partition_frequency: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.render_config","title":"render_config class-attribute instance-attribute","text":"
render_config: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.should_inject_dynamic_properties","title":"should_inject_dynamic_properties class-attribute instance-attribute","text":"
should_inject_dynamic_properties: bool = True\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.stac_api","title":"stac_api class-attribute instance-attribute","text":"
stac_api: str = 'https://planetarycomputer.microsoft.com/api/stac/v1'\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__init__","title":"__init__","text":"
__init__(\n    collection_id: str,\n    partition_frequency: str | None = None,\n    stac_api: str = \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n    should_inject_dynamic_properties: bool = True,\n    render_config: str | None = None,\n) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__post_init__","title":"__post_init__","text":"
__post_init__() -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_collection","title":"export_collection","text":"
export_collection(\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> list[str | None]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition","title":"export_partition","text":"
export_partition(\n    conninfo: str,\n    query: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any] | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition_for_endpoints","title":"export_partition_for_endpoints","text":"
export_partition_for_endpoints(\n    endpoints: tuple[datetime, datetime],\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    part_number: int | None = None,\n    total: int | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n

Export results for a pair of endpoints.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.generate_endpoints","title":"generate_endpoints","text":"
generate_endpoints(\n    since: datetime | None = None,\n) -> list[tuple[datetime, datetime]]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_assets","title":"inject_assets","text":"
inject_assets(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_links","title":"inject_links","text":"
inject_links(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.make_pgstac_items","title":"make_pgstac_items","text":"
make_pgstac_items(\n    records: list[tuple[str, str, str, datetime, datetime, dict[str, Any]]],\n    base_item: dict[str, Any],\n) -> list[dict[str, Any]]\n

Make STAC items out of pgstac records.

Parameters:

  • records (list[tuple[str, str, str, datetime, datetime, dict[str, Any]]]) \u2013

    list[tuple] The dehydrated records from pgstac.items table.

  • base_item (dict[str, Any]) \u2013

    dict[str, Any] The base item from the collection_base_item pgstac function for this collection. Used for rehydration

"},{"location":"spec/stac-geoparquet-spec/","title":"STAC GeoParquet Specification","text":""},{"location":"spec/stac-geoparquet-spec/#overview","title":"Overview","text":"

This document specifies how to map a set of STAC Items into GeoParquet. It is directly inspired by the STAC GeoParquet library, but aims to provide guidance for anyone putting STAC data into GeoParquet.

"},{"location":"spec/stac-geoparquet-spec/#use-cases","title":"Use cases","text":"
  • Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file.
  • As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON.
  • Provide efficient access to specific fields of a STAC item, thanks to Parquet's columnar format.
"},{"location":"spec/stac-geoparquet-spec/#guidelines","title":"Guidelines","text":"

Each row in the Parquet Dataset represents a single STAC item. Almost all of the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping from JSON into nested structures. We do pull the properties up to the top level so that they are easier to query and use. The names of most of the fields should be the same in STAC and in GeoParquet.

Field GeoParquet Type Required Details type String Optional This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet stac_extensions List of Strings Required This column is required, but can be empty if no STAC extensions were used id String Required Required, should be unique within each collection geometry Binary (WKB) Required For GeoParquet 1.0 this must be well-known Binary bbox Struct of Floats Required Can be a 4 or 6 value struct, depending on dimension of the data. It must conform to the \"Bounding Box Columns\" definition of GeoParquet 1.1. links List of Link structs Required See Link Struct for more info assets An Assets struct Required See Asset Struct for more info collection String Optional The ID of the collection this Item is a part of. See notes below on 'Collection' and 'Collection JSON' in the Parquet metadata property columns varies - Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field
  • Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
  • We strongly recommend having only one GeoParquet file per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
  • Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
  • STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
  • datetime columns should be stored as a native timestamp, not as a string
  • The Collection JSON should be included in the Parquet metadata. See Collection JSON below.
  • Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. proj:geometry) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.
"},{"location":"spec/stac-geoparquet-spec/#link-struct","title":"Link Struct","text":"

The GeoParquet dataset can contain zero or more Link Structs. Each Link Struct has 2 required fields and 2 optional ones:

Field Name Type Description href string REQUIRED. The actual link in the format of an URL. Relative and absolute links are both allowed. rel string REQUIRED. Relationship between the current document and the linked document. See chapter \"Relation types\" for more information. type string Media type of the referenced entity. title string A human readable title to be used in rendered displays of the link.

See Link Object for more.

"},{"location":"spec/stac-geoparquet-spec/#asset-struct","title":"Asset Struct","text":"

The GeoParquet dataset can contain zero or more Asset Structs. Each Asset Struct can have the following fields:

Field Name Type Description href string REQUIRED. URI to the asset object. Relative and absolute URI are both allowed. title string The displayed title for clients and users. description string A description of the Asset providing additional details, such as how it was processed or created. CommonMark 0.29 syntax MAY be used for rich text representation. type string Media type of the asset. See the common media types in the best practice doc for commonly used asset types. roles [string] The semantic roles of the asset, similar to the use of rel in links.

Each struct has the full asset key and object as a sub-struct; it's a direct mapping from the JSON to Parquet.

To take advantage of Parquet's columnar nature and compression, the assets should be uniform so they can be represented by a simple schema, which in turn means every item should probably come from the same STAC collection.

See Asset Object for more.

"},{"location":"spec/stac-geoparquet-spec/#including-a-stac-collection-json-in-a-stac-geoparquet-collection","title":"Including a STAC Collection JSON in a STAC Geoparquet Collection","text":"

To make a stac-geoparquet file a fully self-contained representation, you can include the Collection JSON in the Parquet metadata. If present in the Parquet file metadata, the key must be stac:collection and the value must be a JSON string with the Collection JSON.
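
For example, a sketch of embedding a Collection JSON with pyarrow (the paths and the collection dict are placeholders):

import json\n\nimport pyarrow.parquet as pq\n\ncollection = {\"type\": \"Collection\", \"id\": \"example-collection\"}\n\ntable = pq.read_table(\"items.parquet\")\n# Copy the existing file metadata (including the GeoParquet \"geo\" key) and add stac:collection.\nmetadata = dict(table.schema.metadata or {})\nmetadata[b\"stac:collection\"] = json.dumps(collection).encode(\"utf-8\")\npq.write_table(table.replace_schema_metadata(metadata), \"items-with-collection.parquet\")\n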

"},{"location":"spec/stac-geoparquet-spec/#referencing-a-stac-geoparquet-collections-in-a-stac-collection-json","title":"Referencing a STAC Geoparquet Collections in a STAC Collection JSON","text":"

A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an Asset Object at the collection level of the STAC JSON that includes the application/vnd.apache.parquet Media type and collection-mirror Role type to describe the function of the Geoparquet STAC Collection Asset.

For example:

Field Name Type Value href string s3://example/uri/to/file.parquet title string An example STAC GeoParquet. description string Example description. type string application/vnd.apache.parquet roles [string] [collection-mirror]*

*Note that the IANA has not yet approved the new media type application/vnd.apache.parquet; it has been submitted for approval.

The description should ideally include details about the spatial partitioning method.

"},{"location":"spec/stac-geoparquet-spec/#mapping-to-other-geospatial-data-formats","title":"Mapping to other geospatial data formats","text":"

The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that.

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"STAC-geoparquet","text":"

Convert STAC items between JSON, GeoParquet, pgstac, and Delta Lake.

"},{"location":"#purpose","title":"Purpose","text":"

The STAC spec defines a JSON-based schema. But it can be hard to manage and search through many millions of STAC items in JSON format. For one, JSON is very large on disk. And you need to parse the entire JSON data into memory to extract just a small piece of information, say the datetime and one asset of an Item.

GeoParquet can be a good complement to JSON for many bulk-access and analytic use cases. While STAC Items are commonly distributed as individual JSON files on object storage or through a STAC API, STAC GeoParquet allows users to access a large number of STAC items in bulk without making repeated HTTP requests.

For analytic questions like \"find the items in the Sentinel-2 collection in June 2024 over New York City with cloud cover of less than 20%\" it can be much, much faster to find the relevant data from a GeoParquet source than from JSON, because GeoParquet needs to load only the relevant columns for that query, not the full data.

See the STAC-GeoParquet specification for details on the exact schema of the written Parquet files.

"},{"location":"#documentation","title":"Documentation","text":"

Documentation website

"},{"location":"drawbacks/","title":"Drawbacks","text":"

Trying to represent STAC data in GeoParquet has some drawbacks.

"},{"location":"drawbacks/#unable-to-represent-undefined-values","title":"Unable to represent undefined values","text":"

Parquet is unable to represent the difference between undefined and null, and so is unable to perfectly round-trip STAC data with undefined values.

In JSON a value can have one of three states: defined, undefined, or null. The \"b\" key in the next three examples illustrates this:

Defined:

{\n  \"a\": 1,\n  \"b\": \"foo\"\n}\n

Undefined:

{\n  \"a\": 2\n}\n

Null:

{\n  \"a\": 3,\n  \"b\": null\n}\n

Because Parquet is a columnar format, it is only able to represent undefined at the column level. So if those three JSON items above were converted to Parquet, the column \"b\" would exist because it exists in the first and third item, and the second item would have \"b\" inferred as null:

a b 1 \"foo\" 2 null 3 null

Then when the second item is converted back to JSON, it will be returned as

{\n  \"a\": 2\n  \"b\": null\n}\n

which is not strictly equal to the input.

"},{"location":"drawbacks/#schema-difficulties","title":"Schema difficulties","text":"

JSON is schemaless while Parquet requires a strict schema, and it can be very difficult to unite these two systems. This is such an important consideration that we have a documentation page just to discuss this point.

"},{"location":"schema/","title":"Schema considerations","text":"

A STAC Item is a JSON object to describe an external geospatial dataset. The STAC specification defines a common core, plus a variety of extensions. Additionally, STAC Items may include custom extensions outside the common ones. Crucially, the majority of the specified fields in the core spec and extensions define optional keys. Those keys often differ across STAC collections and may even differ within a single collection across items.

STAC's flexibility is a blessing and a curse. The flexibility of schemaless JSON allows for very easy writing as each object can be dumped separately to JSON. Every item is allowed to have a different schema. And newer items are free to have a different schema than older items in the same collection. But this write-time flexibility makes it harder to read as there are no guarantees (outside STAC's few required fields) about what fields exist.

Parquet is the complete opposite of JSON. Parquet has a strict schema that must be known before writing can start. This puts the burden of work onto the writer instead of the reader. Reading Parquet is very efficient because the file's metadata defines the exact schema of every record. This also enables use cases like reading specific columns that would not be possible without a strict schema.

This conversion from schemaless to strict-schema is the difficult part of converting STAC from JSON to GeoParquet, especially for large input datasets like STAC that are often larger than memory.

"},{"location":"schema/#full-scan-over-input-data","title":"Full scan over input data","text":"

The most foolproof way to convert STAC JSON to GeoParquet is to perform a full scan over input data. This is done automatically by parse_stac_ndjson_to_arrow when a schema is not provided.

This is time consuming as it requires two full passes over the input data: once to infer a common schema and again to actually write to Parquet (though items are never fully held in memory, allowing this process to scale).
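
For example, a minimal sketch (items.ndjson is a placeholder path):

import stac_geoparquet.arrow\n\n# With schema=None the library first scans the file to infer a common schema,\n# then reads it again to produce the record batches.\nreader = stac_geoparquet.arrow.parse_stac_ndjson_to_arrow(\"items.ndjson\")\ntable = reader.read_all()\n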

"},{"location":"schema/#user-provided-schema","title":"User-provided schema","text":"

Alternatively, the user can pass in an Arrow schema themselves using the schema parameter of parse_stac_ndjson_to_arrow. This schema must match the on-disk schema of the STAC JSON data.

"},{"location":"schema/#multiple-schemas-per-collection","title":"Multiple schemas per collection","text":"

It is also possible to write multiple Parquet files with STAC data where each Parquet file may have a different schema. This simplifies the conversion and writing process but makes reading and using the Parquet data harder.

"},{"location":"schema/#merging-data-with-schema-mismatch","title":"Merging data with schema mismatch","text":"

If you've created STAC GeoParquet data where the schema has changed over time, you can use pyarrow.concat_tables with promote_options=\"permissive\" to combine multiple STAC GeoParquet files.

import pyarrow as pa\nimport pyarrow.parquet as pq\n\ntable_1 = pq.read_table(\"stac1.parquet\")\ntable_2 = pq.read_table(\"stac2.parquet\")\ncombined_table = pa.concat_tables([table_1, table_2], promote_options=\"permissive\")\n
"},{"location":"schema/#future-work","title":"Future work","text":"

Schema operations are an area where future work can improve the reliability and ease of use of STAC GeoParquet.

It's possible that in the future we could automatically infer an Arrow schema from the STAC specification's published JSON Schema files. If you're interested in this, open an issue and discuss.

"},{"location":"usage/","title":"Usage","text":"

Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

All functionality that goes through Arrow is currently exported via the stac_geoparquet.arrow namespace.

"},{"location":"usage/#dictjson-arrow-conversion","title":"dict/JSON - Arrow conversion","text":""},{"location":"usage/#convert-dicts-to-arrow","title":"Convert dicts to Arrow","text":"

Use parse_stac_items_to_arrow to convert STAC items either in memory or on disk to a stream of Arrow record batches. This accepts either an iterable of Python dicts or an iterable of pystac.Item objects.

For example:

import pyarrow as pa\nimport pystac\n\nimport stac_geoparquet\n\nitem = pystac.read_file(\n    \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items/S2A_MSIL2A_20230112T104411_R008_T29NPE_20230113T053333\"\n)\nassert isinstance(item, pystac.Item)\n\nrecord_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow([item])\ntable = record_batch_reader.read_all()\n
"},{"location":"usage/#convert-json-to-arrow","title":"Convert JSON to Arrow","text":"

parse_stac_ndjson_to_arrow is a helper function to take one or more JSON or newline-delimited JSON files on disk, infer the schema from all of them, and convert the data to a stream of Arrow record batches.

"},{"location":"usage/#convert-arrow-to-dicts","title":"Convert Arrow to dicts","text":"

Use stac_table_to_items to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.
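
For example (items.parquet is a placeholder for an existing STAC GeoParquet file):

import pyarrow.parquet as pq\n\nimport stac_geoparquet.arrow\n\n# Read a STAC GeoParquet file and walk its rows as STAC Item dicts.\ntable = pq.read_table(\"items.parquet\")\nfor item in stac_geoparquet.arrow.stac_table_to_items(table):\n    print(item[\"id\"])\n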

"},{"location":"usage/#convert-arrow-to-json","title":"Convert Arrow to JSON","text":"

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a newline-delimited JSON file. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.
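
For example (placeholder paths):

import pyarrow.parquet as pq\n\nimport stac_geoparquet.arrow\n\n# Convert a STAC GeoParquet file back to newline-delimited JSON.\ntable = pq.read_table(\"items.parquet\")\nstac_geoparquet.arrow.stac_table_to_ndjson(table, \"items.ndjson\")\n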

"},{"location":"usage/#parquet","title":"Parquet","text":"

Use to_parquet to write STAC Arrow data from memory to a path or file-like object. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.
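
For example, a sketch that streams newline-delimited JSON straight into a GeoParquet file (placeholder paths):

import stac_geoparquet.arrow\n\n# to_parquet accepts a Table, a RecordBatchReader, or another Arrow stream\n# and writes GeoParquet metadata alongside the data.\nreader = stac_geoparquet.arrow.parse_stac_ndjson_to_arrow(\"items.ndjson\")\nstac_geoparquet.arrow.to_parquet(reader, \"items.parquet\", schema_version=\"1.1.0\")\n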

parse_stac_ndjson_to_parquet is a helper that connects reading (newline-delimited) JSON on disk to writing out to a Parquet file.

No special API is required for reading a STAC GeoParquet file back into Arrow. You can use pyarrow.parquet.read_table or pyarrow.parquet.ParquetFile directly to read the STAC GeoParquet data back into Arrow.
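
For example (placeholder paths):

import pyarrow.parquet as pq\n\nimport stac_geoparquet.arrow\n\n# End-to-end: newline-delimited JSON to GeoParquet, then back into Arrow with plain pyarrow.\nstac_geoparquet.arrow.parse_stac_ndjson_to_parquet(\"items.ndjson\", \"items.parquet\")\ntable = pq.read_table(\"items.parquet\")\n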

"},{"location":"usage/#delta-lake","title":"Delta Lake","text":"

Use parse_stac_ndjson_to_delta_lake to read (newline-delimited) JSON on disk and write out to a Delta Lake table.

No special API is required for reading a STAC Delta Lake table back into Arrow. You can use the DeltaTable class directly to read the data back into Arrow.
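
For example, a sketch assuming the deltalake package is installed (placeholder paths):

from deltalake import DeltaTable\n\nimport stac_geoparquet.arrow\n\n# Write newline-delimited JSON out as a Delta Lake table, then read it back into Arrow.\nstac_geoparquet.arrow.parse_stac_ndjson_to_delta_lake(\"items.ndjson\", \"stac-delta\")\ntable = DeltaTable(\"stac-delta\").to_pyarrow_table()\n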

Important

Arrow has a null data type, where every value in the column is always null, but Delta Lake does not. This means that for any column inferred to have a null data type, writing to Delta Lake will error with

_internal.SchemaMismatchError: Invalid data type for Delta Lake: Null\n

This is a problem because if all items in a STAC Collection have a null JSON key, it gets inferred as an Arrow null type. For example, the 3dep-lidar-copc collection in the tests has start_datetime and end_datetime fields, and so according to the spec, datetime is always null. This column would need to be cast to a timestamp type before being written to Delta Lake.

This means we cannot write this collection to Delta Lake solely with automatic schema inference.

In such cases, users may need to manually update the inferred schema to cast any null type to another Delta Lake-compatible type.
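
For example, a sketch of patching an inferred pyarrow schema before writing (the field name and target type match the case described above):

import pyarrow as pa\n\n\ndef replace_null_field(schema: pa.Schema, name: str, new_type: pa.DataType) -> pa.Schema:\n    # Swap a null-typed field for a Delta Lake-compatible type.\n    index = schema.get_field_index(name)\n    return schema.set(index, pa.field(name, new_type))\n\n\n# e.g. schema = replace_null_field(schema, \"datetime\", pa.timestamp(\"us\", tz=\"UTC\"))\n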

"},{"location":"api/arrow/","title":"stac_geoparquet.arrow","text":"

Arrow-based format conversions.

"},{"location":"api/arrow/#stac_geoparquet.arrow","title":"stac_geoparquet.arrow","text":""},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_JSON_CHUNK_SIZE","title":"DEFAULT_JSON_CHUNK_SIZE module-attribute","text":"
DEFAULT_JSON_CHUNK_SIZE = 65536\n

The default chunk size to use for reading JSON into memory.

"},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_PARQUET_SCHEMA_VERSION","title":"DEFAULT_PARQUET_SCHEMA_VERSION module-attribute","text":"
DEFAULT_PARQUET_SCHEMA_VERSION: SUPPORTED_PARQUET_SCHEMA_VERSIONS = '1.1.0'\n

The default GeoParquet schema version written to file.

"},{"location":"api/arrow/#stac_geoparquet.arrow.SUPPORTED_PARQUET_SCHEMA_VERSIONS","title":"SUPPORTED_PARQUET_SCHEMA_VERSIONS module-attribute","text":"
SUPPORTED_PARQUET_SCHEMA_VERSIONS = Literal['1.0.0', '1.1.0']\n

A Literal type with the supported GeoParquet schema versions.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_items_to_arrow","title":"parse_stac_items_to_arrow","text":"
parse_stac_items_to_arrow(\n    items: Iterable[Item | dict[str, Any]],\n    *,\n    chunk_size: int = 8192,\n    schema: Schema | InferredSchema | None = None\n) -> RecordBatchReader\n

Parse a collection of STAC Items to an iterable of pyarrow.RecordBatch.

The objects under properties are moved up to the top-level of the Table, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Iterable[Item | dict[str, Any]]) \u2013

    the STAC Items to convert

  • chunk_size (int, default: 8192 ) \u2013

    The chunk size to use for Arrow record batches. This only takes effect if schema is not None. When schema is None, the input will be parsed into a single contiguous record batch. Defaults to 8192.

  • schema (Schema | InferredSchema | None, default: None ) \u2013

    The schema of the input data. If provided, can improve memory use; otherwise all items need to be parsed into a single array for schema inference. Defaults to None.

Returns:

  • RecordBatchReader \u2013

    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow","title":"parse_stac_ndjson_to_arrow","text":"
parse_stac_ndjson_to_arrow(\n    path: str | Path | Iterable[str | Path],\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None\n) -> RecordBatchReader\n

Convert one or more newline-delimited JSON STAC files to a generator of Arrow RecordBatches.

Each RecordBatch in the returned iterator is guaranteed to have an identical schema, and can be used to write to one or more Parquet files.

Parameters:

  • path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data.

Other Parameters:

  • limit (int | None) \u2013

    The maximum number of JSON Items to use for schema inference

Returns:

  • RecordBatchReader \u2013

    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_delta_lake","title":"parse_stac_ndjson_to_delta_lake","text":"
parse_stac_ndjson_to_delta_lake(\n    input_path: str | Path | Iterable[str | Path],\n    table_or_uri: str | Path | DeltaTable,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to Delta Lake

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • table_or_uri (str | Path | DeltaTable) \u2013

    A path to the output Delta Lake table

Parameters:

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size to use for reading JSON into memory. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None, default: None ) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS, default: DEFAULT_PARQUET_SCHEMA_VERSION ) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_parquet","title":"parse_stac_ndjson_to_parquet","text":"
parse_stac_ndjson_to_parquet(\n    input_path: str | Path | Iterable[str | Path],\n    output_path: str | Path,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | InferredSchema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to GeoParquet

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • output_path (str | Path) \u2013

    A path to the output Parquet file.

Other Parameters:

  • chunk_size (int) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | InferredSchema | None) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

All other keyword args are passed on to pyarrow.parquet.ParquetWriter.

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_items","title":"stac_table_to_items","text":"
stac_table_to_items(\n    table: Table | RecordBatchReader | ArrowStreamExportable,\n) -> Iterable[dict]\n

Convert STAC Arrow to a generator of STAC Item dicts.

Parameters:

  • table (Table | RecordBatchReader | ArrowStreamExportable) \u2013

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow RecordBatchReader, or any other Arrow stream object exposed through the Arrow PyCapsule Interface. A RecordBatchReader or stream object will not be materialized in memory.

Yields:

  • Iterable[dict] \u2013

    A STAC dict for each input row.

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_ndjson","title":"stac_table_to_ndjson","text":"
stac_table_to_ndjson(\n    table: Table | RecordBatchReader | ArrowStreamExportable,\n    dest: str | Path | PathLike[bytes],\n) -> None\n

Write STAC Arrow to a newline-delimited JSON file.

Parameters:

  • table (Table | RecordBatchReader | ArrowStreamExportable) \u2013

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow RecordBatchReader, or any other Arrow stream object exposed through the Arrow PyCapsule Interface. A RecordBatchReader or stream object will not be materialized in memory.

  • dest (str | Path | PathLike[bytes]) \u2013

    The destination where newline-delimited JSON should be written.

"},{"location":"api/arrow/#stac_geoparquet.arrow.to_parquet","title":"to_parquet","text":"
to_parquet(\n    table: Table | RecordBatchReader | ArrowStreamExportable,\n    output_path: str | Path,\n    *,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Write an Arrow table with STAC data to GeoParquet

This writes metadata compliant with either GeoParquet 1.0 or 1.1.

Parameters:

  • table (Table | RecordBatchReader | ArrowStreamExportable) \u2013

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow RecordBatchReader, or any other Arrow stream object exposed through the Arrow PyCapsule Interface. A RecordBatchReader or stream object will not be materialized in memory.

  • output_path (str | Path) \u2013

    The destination for saving.

Other Parameters:

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

All other keyword args are passed on to pyarrow.parquet.ParquetWriter.
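A sketch of writing a stream of STAC batches, assuming a local item.json file; the compression keyword is forwarded to pyarrow.parquet.ParquetWriter:

import pystac

import stac_geoparquet.arrow

# Assumption: item.json is a local STAC Item; any iterable of dicts or
# pystac.Items works here.
items = [pystac.read_file("item.json")]
record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(items)

stac_geoparquet.arrow.to_parquet(
    record_batch_reader,
    "items.parquet",
    schema_version="1.0.0",
    compression="zstd",
)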

"},{"location":"api/legacy/","title":"Direct GeoPandas conversion (Legacy)","text":"

The API listed here was the initial non-Arrow-based STAC-GeoParquet implementation, converting between JSON and GeoPandas directly. For large collections of STAC items, using the new Arrow-based functionality (under the stac_geoparquet.arrow namespace) will be more performant.

Note that stac_geoparquet lifts the keys in the item properties up to the top level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

>>> import requests\n>>> import stac_geoparquet.arrow\n>>> import pyarrow.parquet\n>>> import pyarrow as pa\n\n>>> items = requests.get(\n...     \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items\"\n... ).json()[\"features\"]\n>>> table = pa.Table.from_batches(stac_geoparquet.arrow.parse_stac_items_to_arrow(items))\n>>> stac_geoparquet.arrow.to_parquet(table, \"items.parquet\")\n>>> table2 = pyarrow.parquet.read_table(\"items.parquet\")\n>>> items2 = list(stac_geoparquet.arrow.stac_table_to_items(table2))\n
"},{"location":"api/legacy/#stac_geoparquet.to_geodataframe","title":"stac_geoparquet.to_geodataframe","text":"
to_geodataframe(\n    items: Sequence[dict[str, Any]],\n    add_self_link: bool = False,\n    dtype_backend: DTYPE_BACKEND | None = None,\n    datetime_precision: str = \"ns\",\n) -> GeoDataFrame\n

Convert a sequence of STAC items to a geopandas.GeoDataFrame.

The objects under properties are moved up to the top-level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Sequence[dict[str, Any]]) \u2013

    A sequence of STAC items.

  • add_self_link (bool, default: False ) \u2013

Add the absolute link (if available) to the source STAC Item as a separate column named \"self_link\".

  • dtype_backend (DTYPE_BACKEND | None, default: None ) \u2013

The dtype backend to use for storing arrays: either 'pyarrow' or 'numpy_nullable'.

    By default, this will use 'numpy_nullable' and emit a FutureWarning that the default will change to 'pyarrow' in the next release.

    Set to 'numpy_nullable' to silence the warning and accept the old behavior.

    Set to 'pyarrow' to silence the warning and accept the new behavior.

There are some differences in the output as well: with dtype_backend=\"pyarrow\", struct-like fields will explicitly contain null values for fields that appear in only some of the records. For example, given assets like:

    {\n    \"a\": {\n        \"href\": \"a.tif\",\n    },\n    \"b\": {\n        \"href\": \"b.tif\",\n        \"title\": \"B\",\n    }\n}\n

The assets field of the output for the first row with dtype_backend=\"numpy_nullable\" will be a Python dictionary with just {\"href\": \"a.tif\"}.

    With dtype_backend=\"pyarrow\", this will be a pyarrow struct with fields {\"href\": \"a.tif\", \"title\", None}. pyarrow will infer that the struct field asset.title is nullable.

  • datetime_precision (str, default: 'ns' ) \u2013

    str, default \"ns\" The precision to use for the datetime columns. For example, \"us\" is microsecond and \"ns\" is nanosecond.

Returns:

  • GeoDataFrame \u2013

    The converted GeoDataFrame.
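For example, a sketch using the same Planetary Computer endpoint as the example above:

import requests

import stac_geoparquet

items = requests.get(
    "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items"
).json()["features"]

# dtype_backend="pyarrow" opts into the new behavior and silences the FutureWarning.
gdf = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow")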

"},{"location":"api/legacy/#stac_geoparquet.to_item_collection","title":"stac_geoparquet.to_item_collection","text":"
to_item_collection(df: GeoDataFrame) -> ItemCollection\n

Convert a GeoDataFrame of STAC items to a pystac.ItemCollection.

Parameters:

  • df (GeoDataFrame) \u2013

    A GeoDataFrame with a schema similar to that exported by stac-geoparquet.

Returns:

  • ItemCollection \u2013

The converted ItemCollection. There will be one record / feature per row in the GeoDataFrame.
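A sketch, assuming a hypothetical items.parquet file that follows the stac-geoparquet schema:

import geopandas as gpd

import stac_geoparquet

gdf = gpd.read_parquet("items.parquet")
item_collection = stac_geoparquet.to_item_collection(gdf)
print(len(item_collection.items))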

"},{"location":"api/legacy/#stac_geoparquet.to_dict","title":"stac_geoparquet.to_dict","text":"
to_dict(record: dict) -> dict\n

Create a dictionary representing a STAC item from a row of the GeoDataFrame.

Parameters:

  • record (dict) \u2013

A dict representing a single record (row) of the GeoDataFrame.

"},{"location":"api/pgstac/","title":"pgstac integration","text":"

stac_geoparquet.pgstac_reader has some helpers for working with items coming from a pgstac.items table. It takes care of

  • Rehydrating the dehydrated items
  • Partitioning by time
  • Injecting dynamic links and assets from a STAC API
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig","title":"stac_geoparquet.pgstac_reader.CollectionConfig dataclass","text":"

Additional collection-based configuration to inject, matching the dynamic properties from the API.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection","title":"collection property","text":"
collection: Collection\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection_id","title":"collection_id instance-attribute","text":"
collection_id: str\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.partition_frequency","title":"partition_frequency class-attribute instance-attribute","text":"
partition_frequency: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.render_config","title":"render_config class-attribute instance-attribute","text":"
render_config: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.should_inject_dynamic_properties","title":"should_inject_dynamic_properties class-attribute instance-attribute","text":"
should_inject_dynamic_properties: bool = True\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.stac_api","title":"stac_api class-attribute instance-attribute","text":"
stac_api: str = 'https://planetarycomputer.microsoft.com/api/stac/v1'\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__init__","title":"__init__","text":"
__init__(\n    collection_id: str,\n    partition_frequency: str | None = None,\n    stac_api: str = \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n    should_inject_dynamic_properties: bool = True,\n    render_config: str | None = None,\n) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__post_init__","title":"__post_init__","text":"
__post_init__() -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_collection","title":"export_collection","text":"
export_collection(\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> list[str | None]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition","title":"export_partition","text":"
export_partition(\n    conninfo: str,\n    query: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any] | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition_for_endpoints","title":"export_partition_for_endpoints","text":"
export_partition_for_endpoints(\n    endpoints: tuple[datetime, datetime],\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    part_number: int | None = None,\n    total: int | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n

Export results for a pair of endpoints.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.generate_endpoints","title":"generate_endpoints","text":"
generate_endpoints(\n    since: datetime | None = None,\n) -> list[tuple[datetime, datetime]]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_assets","title":"inject_assets","text":"
inject_assets(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_links","title":"inject_links","text":"
inject_links(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.make_pgstac_items","title":"make_pgstac_items","text":"
make_pgstac_items(\n    records: list[tuple[str, str, str, datetime, datetime, dict[str, Any]]],\n    base_item: dict[str, Any],\n) -> list[dict[str, Any]]\n

Make STAC items out of pgstac records.

Parameters:

  • records (list[tuple[str, str, str, datetime, datetime, dict[str, Any]]]) \u2013

The dehydrated records from the pgstac.items table.

  • base_item (dict[str, Any]) \u2013

The base item from the collection_base_item pgstac function for this collection, used for rehydration.
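As an illustration only, a sketch of exporting a collection with CollectionConfig; the connection string, output location, and partition frequency below are placeholders, not values from this documentation:

from stac_geoparquet.pgstac_reader import CollectionConfig

config = CollectionConfig(
    collection_id="sentinel-2-l2a",
    partition_frequency="AS",  # assumption: a pandas-style offset alias (yearly)
)
config.export_collection(
    conninfo="postgresql://username:password@localhost:5432/postgis",  # placeholder
    output_protocol="file",
    output_path="./sentinel-2-l2a",
    storage_options={},
)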

"},{"location":"spec/stac-geoparquet-spec/","title":"STAC GeoParquet Specification","text":""},{"location":"spec/stac-geoparquet-spec/#overview","title":"Overview","text":"

This document specifies how to map a set of STAC Items into GeoParquet. It is directly inspired by the STAC GeoParquet library, but aims to provide guidance for anyone putting STAC data into GeoParquet.

"},{"location":"spec/stac-geoparquet-spec/#use-cases","title":"Use cases","text":"
  • Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file.
  • As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON.
  • Provide efficient access to specific fields of a STAC item, thanks to Parquet's columnar format (see the sketch just after this list).
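For instance, a sketch of such a column-restricted read with pyarrow; the file and property column names are hypothetical:

import pyarrow.parquet as pq

# Only the requested columns are read from disk.
table = pq.read_table(
    "sentinel-2-l2a.parquet",
    columns=["id", "datetime", "eo:cloud_cover", "geometry"],
)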
"},{"location":"spec/stac-geoparquet-spec/#guidelines","title":"Guidelines","text":"

Each row in the Parquet Dataset represents a single STAC item. Almost all of the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of most of the fields should be the same in STAC and in GeoParquet.

The core fields map as follows (field: GeoParquet type, required or optional, details):
  • type (String, Optional): This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet.
  • stac_extensions (List of Strings, Required): This column is required, but can be empty if no STAC extensions were used.
  • id (String, Required): Required, should be unique within each collection.
  • geometry (Binary (WKB), Required): For GeoParquet 1.0 this must be well-known binary.
  • bbox (Struct of Floats, Required): Can be a 4 or 6 value struct, depending on dimension of the data. It must conform to the \"Bounding Box Columns\" definition of GeoParquet 1.1.
  • links (List of Link structs, Required): See Link Struct for more info.
  • assets (An Assets struct, Required): See Asset Struct for more info.
  • collection (String, Optional): The ID of the collection this Item is a part of. See notes below on 'Collection' and 'Collection JSON' in the Parquet metadata.
  • property columns (varies): Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field.
  • Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
  • It is strongly recommended to have only one GeoParquet file per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas in the collection) with lots of empty data.
  • Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
  • STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
  • datetime columns should be stored as a native timestamp, not as a string
  • The Collection JSON should be included in the Parquet metadata. See Collection JSON below.
  • Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. proj:geometry) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.
"},{"location":"spec/stac-geoparquet-spec/#link-struct","title":"Link Struct","text":"

The GeoParquet dataset can contain zero or more Link Structs. Each Link Struct has 2 required fields and 2 optional ones:

  • href (string): REQUIRED. The actual link in the format of a URL. Relative and absolute links are both allowed.
  • rel (string): REQUIRED. Relationship between the current document and the linked document. See chapter \"Relation types\" for more information.
  • type (string): Media type of the referenced entity.
  • title (string): A human readable title to be used in rendered displays of the link.

See Link Object for more.

"},{"location":"spec/stac-geoparquet-spec/#asset-struct","title":"Asset Struct","text":"

The GeoParquet dataset can contain zero or more Asset Structs. Each Asset Struct can have the following fields:

  • href (string): REQUIRED. URI to the asset object. Relative and absolute URIs are both allowed.
  • title (string): The displayed title for clients and users.
  • description (string): A description of the Asset providing additional details, such as how it was processed or created. CommonMark 0.29 syntax MAY be used for rich text representation.
  • type (string): Media type of the asset. See the common media types in the best practice doc for commonly used asset types.
  • roles ([string]): The semantic roles of the asset, similar to the use of rel in links.

The Assets struct contains each full asset key and object as a sub-struct; it's a direct mapping from the JSON to Parquet.

To take advantage of Parquet's columnar nature and compression, the assets should be uniform so they can be represented by a simple schema, which in turn means every item should probably come from the same STAC collection.

See Asset Object for more.

"},{"location":"spec/stac-geoparquet-spec/#including-a-stac-collection-json-in-a-stac-geoparquet-collection","title":"Including a STAC Collection JSON in a STAC Geoparquet Collection","text":"

To make a stac-geoparquet file a fully self-contained representation, you can include the Collection JSON in the Parquet metadata. If present in the Parquet file metadata, the key must be stac:collection and the value must be a JSON string with the Collection JSON.
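One way to do this with pyarrow (a sketch; items.parquet and collection.json are hypothetical local files):

import json

import pyarrow.parquet as pq

table = pq.read_table("items.parquet")
with open("collection.json") as f:
    collection = json.load(f)

# The key must be stac:collection and the value the Collection JSON as a string.
metadata = dict(table.schema.metadata or {})
metadata[b"stac:collection"] = json.dumps(collection).encode("utf-8")
pq.write_table(table.replace_schema_metadata(metadata), "items-with-collection.parquet")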

"},{"location":"spec/stac-geoparquet-spec/#referencing-a-stac-geoparquet-collections-in-a-stac-collection-json","title":"Referencing a STAC Geoparquet Collections in a STAC Collection JSON","text":"

A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an Asset Object at the collection level of the STAC JSON that includes the application/vnd.apache.parquet Media type and collection-mirror Role type to describe the function of the Geoparquet STAC Collection Asset.

For example:

  • href (string): s3://example/uri/to/file.parquet
  • title (string): An example STAC GeoParquet.
  • description (string): Example description.
  • type (string): application/vnd.apache.parquet
  • roles ([string]): [collection-mirror]*

*Note that the IANA has not yet approved the new media type application/vnd.apache.parquet; it has been submitted for approval.

The description should ideally include details about the spatial partitioning method.

"},{"location":"spec/stac-geoparquet-spec/#mapping-to-other-geospatial-data-formats","title":"Mapping to other geospatial data formats","text":"

The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that.

"}]} \ No newline at end of file diff --git a/0.5.1/sitemap.xml b/0.5.1/sitemap.xml index c7f9c19..ddbc218 100644 --- a/0.5.1/sitemap.xml +++ b/0.5.1/sitemap.xml @@ -2,42 +2,42 @@ https://stac-utils.github.io/stac-geoparquet/latest/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/drawbacks/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/schema/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/usage/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/api/legacy/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/api/pgstac/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/spec/stac-geoparquet-spec/ - 2024-06-24 + 2024-06-25 daily \ No newline at end of file diff --git a/0.5.1/sitemap.xml.gz b/0.5.1/sitemap.xml.gz index ff0ffba75439d5d6bfa656bfc6c1d004f0296c58..207add5b981efa0ac6084b502afcbb3a5081310a 100644 GIT binary patch literal 278 zcmV+x0qOo9iwFpS2zq7$|8r?{Wo=<_E_iKh0M(SiPQ)M(hVT0n4fjHA)Tn9Mn@^w* zFiS^DmKK(w)_r?vyD^PtJunyGpPxS;A=fGA0H909${5XS#VC=0CJ7psjKZt(?+Xa{}pgt%s?ClowVx9;z=7kL22nfN=`C(hspl cdyO8|?_Rlz|9*^hNZ-Ev1}tNkza|C%09hZ0G5`Po diff --git a/0.5.1/usage/index.html b/0.5.1/usage/index.html index d1a9546..2b0a3f1 100644 --- a/0.5.1/usage/index.html +++ b/0.5.1/usage/index.html @@ -733,19 +733,33 @@

Usage

-

Except for the legacy API, Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

+

Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

All functionality that goes through Arrow is currently exported via the stac_geoparquet.arrow namespace.

dict/JSON - Arrow conversion

Convert dicts to Arrow

Use parse_stac_items_to_arrow to convert STAC items either in memory or on disk to a stream of Arrow record batches. This accepts either an iterable of Python dicts or an iterable of pystac.Item objects.

+

For example:

+
import pyarrow as pa
+import pystac
+
+import stac_geoparquet
+
+item = pystac.read_file(
+    "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items/S2A_MSIL2A_20230112T104411_R008_T29NPE_20230113T053333"
+)
+assert isinstance(item, pystac.Item)
+
+record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow([item])
+table = record_batch_reader.read_all()
+

Convert JSON to Arrow

parse_stac_ndjson_to_arrow is a helper function to take one or more JSON or newline-delimited JSON files on disk, infer the schema from all of them, and convert the data to a stream of Arrow record batches.

Convert Arrow to dicts

Use stac_table_to_items to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

Convert Arrow to JSON

-

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

+

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a newline-delimited JSON file. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

Parquet

-

Use to_parquet to write STAC Arrow data in memory. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

+

Use to_parquet to write STAC Arrow data from memory to a path or file-like object. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

parse_stac_ndjson_to_parquet is a helper that connects reading (newline-delimited) JSON on disk to writing out to a Parquet file.

No special API is required for reading a STAC GeoParquet file back into Arrow. You can use pyarrow.parquet.read_table or pyarrow.parquet.ParquetFile directly to read the STAC GeoParquet data back into Arrow.

Delta Lake

diff --git a/latest/api/arrow/index.html b/latest/api/arrow/index.html index 17bb1c8..304788a 100644 --- a/latest/api/arrow/index.html +++ b/latest/api/arrow/index.html @@ -916,7 +916,7 @@

*, chunk_size: int = 8192, schema: Schema | InferredSchema | None = None -) -> Iterable[RecordBatch] +) -> RecordBatchReader

@@ -968,10 +968,10 @@

Returns:

  • - Iterable[RecordBatch] + RecordBatchReader
    -

    an iterable of pyarrow RecordBatches with the STAC-GeoParquet representation of items.

    +

    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

@@ -994,7 +994,7 @@

chunk_size: int = DEFAULT_JSON_CHUNK_SIZE, schema: Schema | None = None, limit: int | None = None -) -> Iterator[RecordBatch] +) -> RecordBatchReader

@@ -1054,13 +1054,13 @@

Yields:

+

Returns:

  • - RecordBatch + RecordBatchReader
    -

    Arrow RecordBatch with a single chunk of Item data.

    +

    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

@@ -1255,6 +1255,8 @@

pyarrow.parquet.ParquetWriter.

@@ -1268,12 +1270,43 @@

-
stac_table_to_items(table: Table) -> Iterable[dict]
+
stac_table_to_items(
+    table: Table | RecordBatchReader | ArrowStreamExportable,
+) -> Iterable[dict]
 
-

Convert a STAC Table to a generator of STAC Item dicts

+

Convert STAC Arrow to a generator of STAC Item dicts.

+ + +

Parameters:

+
    +
  • + table + (Table | RecordBatchReader | ArrowStreamExportable) + – +
    +

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow +RecordBatchReader, or any other Arrow stream object exposed through the +Arrow PyCapsule +Interface. +A RecordBatchReader or stream object will not be materialized in memory.

    +
    +
  • +
+ + +

Yields:

+
    +
  • + Iterable[dict] + – +
    +

    A STAC dict for each input row.

    +
    +
  • +
@@ -1287,12 +1320,40 @@

-
stac_table_to_ndjson(table: Table, dest: str | Path | PathLike[bytes]) -> None
+
stac_table_to_ndjson(
+    table: Table | RecordBatchReader | ArrowStreamExportable,
+    dest: str | Path | PathLike[bytes],
+) -> None
 
-

Write a STAC Table to a newline-delimited JSON file.

+

Write STAC Arrow to a newline-delimited JSON file.

+ + +

Parameters:

+
    +
  • + table + (Table | RecordBatchReader | ArrowStreamExportable) + – +
    +

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow +RecordBatchReader, or any other Arrow stream object exposed through the +Arrow PyCapsule +Interface. +A RecordBatchReader or stream object will not be materialized in memory.

    +
    +
  • +
  • + dest + (str | Path | PathLike[bytes]) + – +
    +

    The destination where newline-delimited JSON should be written.

    +
    +
  • +
@@ -1307,8 +1368,8 @@

to_parquet(
-    table: Table,
-    where: Any,
+    table: Table | RecordBatchReader | ArrowStreamExportable,
+    output_path: str | Path,
     *,
     schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,
     **kwargs: Any
@@ -1325,15 +1386,19 @@ 

  • table - (Table) + (Table | RecordBatchReader | ArrowStreamExportable) –
    -

    The table to write to Parquet

    +

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow +RecordBatchReader, or any other Arrow stream object exposed through the +Arrow PyCapsule +Interface. +A RecordBatchReader or stream object will not be materialized in memory.

  • - where - (Any) + output_path + (str | Path) –

    The destination for saving.

    @@ -1354,6 +1419,8 @@

+

All other keyword args are passed on to +pyarrow.parquet.ParquetWriter.

diff --git a/latest/search/search_index.json b/latest/search/search_index.json index 582ab72..e02f440 100644 --- a/latest/search/search_index.json +++ b/latest/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"STAC-geoparquet","text":"

Convert STAC items between JSON, GeoParquet, pgstac, and Delta Lake.

"},{"location":"#purpose","title":"Purpose","text":"

The STAC spec defines a JSON-based schema. But it can be hard to manage and search through many millions of STAC items in JSON format. For one, JSON is very large on disk. And you need to parse the entire JSON data into memory to extract just a small piece of information, say the datetime and one asset of an Item.

GeoParquet can be a good complement to JSON for many bulk-access and analytic use cases. While STAC Items are commonly distributed as individual JSON files on object storage or through a STAC API, STAC GeoParquet allows users to access a large number of STAC items in bulk without making repeated HTTP requests.

For analytic questions like \"find the items in the Sentinel-2 collection in June 2024 over New York City with cloud cover of less than 20%\" it can be much, much faster to find the relevant data from a GeoParquet source than from JSON, because GeoParquet needs to load only the relevant columns for that query, not the full data.

See the STAC-GeoParquet specification for details on the exact schema of the written Parquet files.

"},{"location":"#documentation","title":"Documentation","text":"

Documentation website

"},{"location":"drawbacks/","title":"Drawbacks","text":"

Trying to represent STAC data in GeoParquet has some drawbacks.

"},{"location":"drawbacks/#unable-to-represent-undefined-values","title":"Unable to represent undefined values","text":"

Parquet is unable to represent the difference between undefined and null, and so is unable to perfectly round-trip STAC data with undefined values.

In JSON a value can have one of three states: defined, undefined, or null. The \"b\" key in the next three examples illustrates this:

Defined:

{\n  \"a\": 1,\n  \"b\": \"foo\"\n}\n

Undefined:

{\n  \"a\": 2\n}\n

Null:

{\n  \"a\": 3,\n  \"b\": null\n}\n

Because Parquet is a columnar format, it is only able to represent undefined at the column level. So if those three JSON items above were converted to Parquet, the column \"b\" would exist because it exists in the first and third item, and the second item would have \"b\" inferred as null:

a b 1 \"foo\" 2 null 3 null

Then when the second item is converted back to JSON, it will be returned as

{\n  \"a\": 2\n  \"b\": null\n}\n

which is not strictly equal to the input.

"},{"location":"drawbacks/#schema-difficulties","title":"Schema difficulties","text":"

JSON is schemaless while Parquet requires a strict schema, and it can be very difficult to unite these two systems. This is such an important consideration that we have a documentation page just to discuss this point.

"},{"location":"schema/","title":"Schema considerations","text":"

A STAC Item is a JSON object to describe an external geospatial dataset. The STAC specification defines a common core, plus a variety of extensions. Additionally, STAC Items may include custom extensions outside the common ones. Crucially, the majority of the specified fields in the core spec and extensions define optional keys. Those keys often differ across STAC collections and may even differ within a single collection across items.

STAC's flexibility is a blessing and a curse. The flexibility of schemaless JSON allows for very easy writing as each object can be dumped separately to JSON. Every item is allowed to have a different schema. And newer items are free to have a different schema than older items in the same collection. But this write-time flexibility makes it harder to read as there are no guarantees (outside STAC's few required fields) about what fields exist.

Parquet is the complete opposite of JSON. Parquet has a strict schema that must be known before writing can start. This puts the burden of work onto the writer instead of the reader. Reading Parquet is very efficient because the file's metadata defines the exact schema of every record. This also enables use cases like reading specific columns that would not be possible without a strict schema.

This conversion from schemaless to strict-schema is the difficult part of converting STAC from JSON to GeoParquet, especially for large input datasets like STAC that are often larger than memory.

"},{"location":"schema/#full-scan-over-input-data","title":"Full scan over input data","text":"

The most foolproof way to convert STAC JSON to GeoParquet is to perform a full scan over input data. This is done automatically by parse_stac_ndjson_to_arrow when a schema is not provided.

This is time consuming as it requires two full passes over the input data: once to infer a common schema and again to actually write to Parquet (though items are never fully held in memory, allowing this process to scale).

"},{"location":"schema/#user-provided-schema","title":"User-provided schema","text":"

Alternatively, the user can pass in an Arrow schema themselves using the schema parameter of parse_stac_ndjson_to_arrow. This schema must match the on-disk schema of the STAC JSON data.

"},{"location":"schema/#multiple-schemas-per-collection","title":"Multiple schemas per collection","text":"

It is also possible to write multiple Parquet files with STAC data where each Parquet file may have a different schema. This simplifies the conversion and writing process but makes reading and using the Parquet data harder.

"},{"location":"schema/#merging-data-with-schema-mismatch","title":"Merging data with schema mismatch","text":"

If you've created STAC GeoParquet data where the schema has updated, you can use pyarrow.concat_tables with promote_options=\"permissive\" to combine multiple STAC GeoParquet files.

import pyarrow as pa\nimport pyarrow.parquet as pq\n\ntable_1 = pq.read_table(\"stac1.parquet\")\ntable_2 = pq.read_table(\"stac2.parquet\")\ncombined_table = pa.concat_tables([table_1, table_2], promote_options=\"permissive\")\n
"},{"location":"schema/#future-work","title":"Future work","text":"

Schema operations are an area where future work can improve the reliability and ease of use of STAC GeoParquet.

It's possible that in the future we could automatically infer an Arrow schema from the STAC specification's published JSON Schema files. If you're interested in this, open an issue and discuss.

"},{"location":"usage/","title":"Usage","text":"

Except for the legacy API, Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

All functionality that goes through Arrow is currently exported via the stac_geoparquet.arrow namespace.

"},{"location":"usage/#dictjson-arrow-conversion","title":"dict/JSON - Arrow conversion","text":""},{"location":"usage/#convert-dicts-to-arrow","title":"Convert dicts to Arrow","text":"

Use parse_stac_items_to_arrow to convert STAC items either in memory or on disk to a stream of Arrow record batches. This accepts either an iterable of Python dicts or an iterable of pystac.Item objects.

"},{"location":"usage/#convert-json-to-arrow","title":"Convert JSON to Arrow","text":"

parse_stac_ndjson_to_arrow is a helper function to take one or more JSON or newline-delimited JSON files on disk, infer the schema from all of them, and convert the data to a stream of Arrow record batches.

"},{"location":"usage/#convert-arrow-to-dicts","title":"Convert Arrow to dicts","text":"

Use stac_table_to_items to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

"},{"location":"usage/#convert-arrow-to-json","title":"Convert Arrow to JSON","text":"

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

"},{"location":"usage/#parquet","title":"Parquet","text":"

Use to_parquet to write STAC Arrow data in memory. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

parse_stac_ndjson_to_parquet is a helper that connects reading (newline-delimited) JSON on disk to writing out to a Parquet file.

No special API is required for reading a STAC GeoParquet file back into Arrow. You can use pyarrow.parquet.read_table or pyarrow.parquet.ParquetFile directly to read the STAC GeoParquet data back into Arrow.

"},{"location":"usage/#delta-lake","title":"Delta Lake","text":"

Use parse_stac_ndjson_to_delta_lake to read (newline-delimited) JSON on disk and write out to a Delta Lake table.

No special API is required for reading a STAC Delta Lake table back into Arrow. You can use the DeltaTable class directly to read the data back into Arrow.

Important

Arrow has a null data type, where every value in the column is always null, but Delta Lake does not. This means that for any column inferred to have a null data type, writing to Delta Lake will error with

_internal.SchemaMismatchError: Invalid data type for Delta Lake: Null\n

This is a problem because if all items in a STAC Collection have a null JSON key, it gets inferred as an Arrow null type. For example, the 3dep-lidar-copc collection used in the tests has start_datetime and end_datetime fields, and so, according to the spec, datetime is always null. This column would need to be cast to a timestamp type before being written to Delta Lake.

This means we cannot write this collection to Delta Lake solely with automatic schema inference.

In such cases, users may need to manually update the inferred schema to cast any null type to another Delta Lake-compatible type.

"},{"location":"api/arrow/","title":"stac_geoparquet.arrow","text":"

Arrow-based format conversions.

"},{"location":"api/arrow/#stac_geoparquet.arrow","title":"stac_geoparquet.arrow","text":""},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_JSON_CHUNK_SIZE","title":"DEFAULT_JSON_CHUNK_SIZE module-attribute","text":"
DEFAULT_JSON_CHUNK_SIZE = 65536\n

The default chunk size to use for reading JSON into memory.

"},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_PARQUET_SCHEMA_VERSION","title":"DEFAULT_PARQUET_SCHEMA_VERSION module-attribute","text":"
DEFAULT_PARQUET_SCHEMA_VERSION: SUPPORTED_PARQUET_SCHEMA_VERSIONS = '1.1.0'\n

The default GeoParquet schema version written to file.

"},{"location":"api/arrow/#stac_geoparquet.arrow.SUPPORTED_PARQUET_SCHEMA_VERSIONS","title":"SUPPORTED_PARQUET_SCHEMA_VERSIONS module-attribute","text":"
SUPPORTED_PARQUET_SCHEMA_VERSIONS = Literal['1.0.0', '1.1.0']\n

A Literal type with the supported GeoParquet schema versions.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_items_to_arrow","title":"parse_stac_items_to_arrow","text":"
parse_stac_items_to_arrow(\n    items: Iterable[Item | dict[str, Any]],\n    *,\n    chunk_size: int = 8192,\n    schema: Schema | InferredSchema | None = None\n) -> Iterable[RecordBatch]\n

Parse a collection of STAC Items to an iterable of pyarrow.RecordBatch.

The objects under properties are moved up to the top-level of the Table, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Iterable[Item | dict[str, Any]]) \u2013

    the STAC Items to convert

  • chunk_size (int, default: 8192 ) \u2013

    The chunk size to use for Arrow record batches. This only takes effect if schema is not None. When schema is None, the input will be parsed into a single contiguous record batch. Defaults to 8192.

  • schema (Schema | InferredSchema | None, default: None ) \u2013

    The schema of the input data. If provided, can improve memory use; otherwise all items need to be parsed into a single array for schema inference. Defaults to None.

Returns:

  • Iterable[RecordBatch] \u2013

    an iterable of pyarrow RecordBatches with the STAC-GeoParquet representation of items.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow","title":"parse_stac_ndjson_to_arrow","text":"
parse_stac_ndjson_to_arrow(\n    path: str | Path | Iterable[str | Path],\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None\n) -> Iterator[RecordBatch]\n

Convert one or more newline-delimited JSON STAC files to a generator of Arrow RecordBatches.

Each RecordBatch in the returned iterator is guaranteed to have an identical schema, and can be used to write to one or more Parquet files.

Parameters:

  • path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data.

Other Parameters:

  • limit (int | None) \u2013

    The maximum number of JSON Items to use for schema inference

Yields:

  • RecordBatch \u2013

    Arrow RecordBatch with a single chunk of Item data.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_delta_lake","title":"parse_stac_ndjson_to_delta_lake","text":"
parse_stac_ndjson_to_delta_lake(\n    input_path: str | Path | Iterable[str | Path],\n    table_or_uri: str | Path | DeltaTable,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to Delta Lake

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • table_or_uri (str | Path | DeltaTable) \u2013

    A path to the output Delta Lake table

Parameters:

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size to use for reading JSON into memory. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None, default: None ) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS, default: DEFAULT_PARQUET_SCHEMA_VERSION ) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_parquet","title":"parse_stac_ndjson_to_parquet","text":"
parse_stac_ndjson_to_parquet(\n    input_path: str | Path | Iterable[str | Path],\n    output_path: str | Path,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | InferredSchema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to GeoParquet

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • output_path (str | Path) \u2013

    A path to the output Parquet file.

Other Parameters:

  • chunk_size (int) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | InferredSchema | None) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_items","title":"stac_table_to_items","text":"
stac_table_to_items(table: Table) -> Iterable[dict]\n

Convert a STAC Table to a generator of STAC Item dicts

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_ndjson","title":"stac_table_to_ndjson","text":"
stac_table_to_ndjson(table: Table, dest: str | Path | PathLike[bytes]) -> None\n

Write a STAC Table to a newline-delimited JSON file.

"},{"location":"api/arrow/#stac_geoparquet.arrow.to_parquet","title":"to_parquet","text":"
to_parquet(\n    table: Table,\n    where: Any,\n    *,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Write an Arrow table with STAC data to GeoParquet

This writes metadata compliant with either GeoParquet 1.0 or 1.1.

Parameters:

  • table (Table) \u2013

    The table to write to Parquet

  • where (Any) \u2013

    The destination for saving.

Other Parameters:

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/legacy/","title":"Direct GeoPandas conversion (Legacy)","text":"

The API listed here was the initial non-Arrow-based STAC-GeoParquet implementation, converting between JSON and GeoPandas directly. For large collections of STAC items, using the new Arrow-based functionality (under the stac_geoparquet.arrow namespace) will be more performant.

Note that stac_geoparquet lifts the keys in the item properties up to the top level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

>>> import requests\n>>> import stac_geoparquet.arrow\n>>> import pyarrow.parquet\n>>> import pyarrow as pa\n\n>>> items = requests.get(\n...     \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items\"\n... ).json()[\"features\"]\n>>> table = pa.Table.from_batches(stac_geoparquet.arrow.parse_stac_items_to_arrow(items))\n>>> stac_geoparquet.arrow.to_parquet(table, \"items.parquet\")\n>>> table2 = pyarrow.parquet.read_table(\"items.parquet\")\n>>> items2 = list(stac_geoparquet.arrow.stac_table_to_items(table2))\n
"},{"location":"api/legacy/#stac_geoparquet.to_geodataframe","title":"stac_geoparquet.to_geodataframe","text":"
to_geodataframe(\n    items: Sequence[dict[str, Any]],\n    add_self_link: bool = False,\n    dtype_backend: DTYPE_BACKEND | None = None,\n    datetime_precision: str = \"ns\",\n) -> GeoDataFrame\n

Convert a sequence of STAC items to a geopandas.GeoDataFrame.

The objects under properties are moved up to the top-level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Sequence[dict[str, Any]]) \u2013

    A sequence of STAC items.

  • add_self_link (bool, default: False ) \u2013

    bool, default False Add the absolute link (if available) to the source STAC Item as a separate column named \"self_link\"

  • dtype_backend (DTYPE_BACKEND | None, default: None ) \u2013

    {'pyarrow', 'numpy_nullable'}, optional The dtype backend to use for storing arrays.

    By default, this will use 'numpy_nullable' and emit a FutureWarning that the default will change to 'pyarrow' in the next release.

    Set to 'numpy_nullable' to silence the warning and accept the old behavior.

    Set to 'pyarrow' to silence the warning and accept the new behavior.

There are some differences in the output as well: with dtype_backend=\"pyarrow\", struct-like fields will explicitly contain null values for fields that appear in only some of the records. For example, given assets like:

    {\n    \"a\": {\n        \"href\": \"a.tif\",\n    },\n    \"b\": {\n        \"href\": \"b.tif\",\n        \"title\": \"B\",\n    }\n}\n

The assets field of the output for the first row with dtype_backend=\"numpy_nullable\" will be a Python dictionary with just {\"href\": \"a.tif\"}.

    With dtype_backend=\"pyarrow\", this will be a pyarrow struct with fields {\"href\": \"a.tif\", \"title\", None}. pyarrow will infer that the struct field asset.title is nullable.

  • datetime_precision (str, default: 'ns' ) \u2013

    str, default \"ns\" The precision to use for the datetime columns. For example, \"us\" is microsecond and \"ns\" is nanosecond.

Returns:

  • GeoDataFrame \u2013

    The converted GeoDataFrame.

"},{"location":"api/legacy/#stac_geoparquet.to_item_collection","title":"stac_geoparquet.to_item_collection","text":"
to_item_collection(df: GeoDataFrame) -> ItemCollection\n

Convert a GeoDataFrame of STAC items to a pystac.ItemCollection.

Parameters:

  • df (GeoDataFrame) \u2013

    A GeoDataFrame with a schema similar to that exported by stac-geoparquet.

Returns:

  • ItemCollection \u2013

The converted ItemCollection. There will be one record / feature per row in the GeoDataFrame.

"},{"location":"api/legacy/#stac_geoparquet.to_dict","title":"stac_geoparquet.to_dict","text":"
to_dict(record: dict) -> dict\n

Create a dictionary representing a STAC item from a row of the GeoDataFrame.

Parameters:

  • record (dict) \u2013

    dict

"},{"location":"api/pgstac/","title":"pgstac integration","text":"

stac_geoparquet.pgstac_reader has some helpers for working with items coming from a pgstac.items table. It takes care of

  • Rehydrating the dehydrated items
  • Partitioning by time
  • Injecting dynamic links and assets from a STAC API
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig","title":"stac_geoparquet.pgstac_reader.CollectionConfig dataclass","text":"

Additional collection-based configuration to inject, matching the dynamic properties from the API.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection","title":"collection property","text":"
collection: Collection\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection_id","title":"collection_id instance-attribute","text":"
collection_id: str\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.partition_frequency","title":"partition_frequency class-attribute instance-attribute","text":"
partition_frequency: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.render_config","title":"render_config class-attribute instance-attribute","text":"
render_config: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.should_inject_dynamic_properties","title":"should_inject_dynamic_properties class-attribute instance-attribute","text":"
should_inject_dynamic_properties: bool = True\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.stac_api","title":"stac_api class-attribute instance-attribute","text":"
stac_api: str = 'https://planetarycomputer.microsoft.com/api/stac/v1'\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__init__","title":"__init__","text":"
__init__(\n    collection_id: str,\n    partition_frequency: str | None = None,\n    stac_api: str = \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n    should_inject_dynamic_properties: bool = True,\n    render_config: str | None = None,\n) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__post_init__","title":"__post_init__","text":"
__post_init__() -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_collection","title":"export_collection","text":"
export_collection(\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> list[str | None]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition","title":"export_partition","text":"
export_partition(\n    conninfo: str,\n    query: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any] | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition_for_endpoints","title":"export_partition_for_endpoints","text":"
export_partition_for_endpoints(\n    endpoints: tuple[datetime, datetime],\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    part_number: int | None = None,\n    total: int | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n

Export results for a pair of endpoints.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.generate_endpoints","title":"generate_endpoints","text":"
generate_endpoints(\n    since: datetime | None = None,\n) -> list[tuple[datetime, datetime]]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_assets","title":"inject_assets","text":"
inject_assets(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_links","title":"inject_links","text":"
inject_links(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.make_pgstac_items","title":"make_pgstac_items","text":"
make_pgstac_items(\n    records: list[tuple[str, str, str, datetime, datetime, dict[str, Any]]],\n    base_item: dict[str, Any],\n) -> list[dict[str, Any]]\n

Make STAC items out of pgstac records.

Parameters:

  • records (list[tuple[str, str, str, datetime, datetime, dict[str, Any]]]) \u2013

    list[tuple] The dehydrated records from pgstac.items table.

  • base_item (dict[str, Any]) \u2013

    dict[str, Any] The base item from the collection_base_item pgstac function for this collection. Used for rehydration

"},{"location":"spec/stac-geoparquet-spec/","title":"STAC GeoParquet Specification","text":""},{"location":"spec/stac-geoparquet-spec/#overview","title":"Overview","text":"

This document specifies how to map a set of STAC Items into GeoParquet. It is directly inspired by the STAC GeoParquet library, but aims to provide guidance for anyone putting STAC data into GeoParquet.

"},{"location":"spec/stac-geoparquet-spec/#use-cases","title":"Use cases","text":"
  • Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file.
  • As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON.
  • Provide efficient access to specific fields of a STAC item, thanks to Parquet's columnar format.
"},{"location":"spec/stac-geoparquet-spec/#guidelines","title":"Guidelines","text":"

Each row in the Parquet Dataset represents a single STAC item. Almost all of the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of most of the fields should be the same in STAC and in GeoParquet.

Field GeoParquet Type Required Details type String Optional This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet stac_extensions List of Strings Required This column is required, but can be empty if no STAC extensions were used id String Required Required, should be unique within each collection geometry Binary (WKB) Required For GeoParquet 1.0 this must be well-known Binary bbox Struct of Floats Required Can be a 4 or 6 value struct, depending on dimension of the data. It must conform to the \"Bounding Box Columns\" definition of GeoParquet 1.1. links List of Link structs Required See Link Struct for more info assets An Assets struct Required See Asset Struct for more info collection String Optional The ID of the collection this Item is a part of. See notes below on 'Collection' and 'Collection JSON' in the Parquet metadata property columns varies - Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field
  • Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
  • Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
  • Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
  • STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
  • datetime columns should be stored as a native timestamp, not as a string
  • The Collection JSON should be included in the Parquet metadata. See Collection JSON below.
  • Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. proj:geometry) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.
"},{"location":"spec/stac-geoparquet-spec/#link-struct","title":"Link Struct","text":"

The GeoParquet dataset can contain zero or more Link Structs. Each Link Struct has 2 required fields and 2 optional ones:

Field Name Type Description href string REQUIRED. The actual link in the format of an URL. Relative and absolute links are both allowed. rel string REQUIRED. Relationship between the current document and the linked document. See chapter \"Relation types\" for more information. type string Media type of the referenced entity. title string A human readable title to be used in rendered displays of the link.

See Link Object for more.

"},{"location":"spec/stac-geoparquet-spec/#asset-struct","title":"Asset Struct","text":"

The GeoParquet dataset can contain zero or more Asset Structs. Each Asset Struct can have the following fields:

Field Name Type Description href string REQUIRED. URI to the asset object. Relative and absolute URI are both allowed. title string The displayed title for clients and users. description string A description of the Asset providing additional details, such as how it was processed or created. CommonMark 0.29 syntax MAY be used for rich text representation. type string Media type of the asset. See the common media types in the best practice doc for commonly used asset types. roles [string] The semantic roles of the asset, similar to the use of rel in links.

The Assets struct contains each full asset key and object as a sub-struct; it's a direct mapping from the JSON to Parquet.

To take advantage of Parquet's columnar nature and compression, the assets should be uniform so they can be represented by a simple schema, which in turn means every item should probably come from the same STAC collection.

See Asset Object for more.

"},{"location":"spec/stac-geoparquet-spec/#including-a-stac-collection-json-in-a-stac-geoparquet-collection","title":"Including a STAC Collection JSON in a STAC Geoparquet Collection","text":"

To make a stac-geoparquet file a fully self-contained representation, you can include the Collection JSON in the Parquet metadata. If present in the Parquet file metadata, the key must be stac:collection and the value must be a JSON string with the Collection JSON.

"},{"location":"spec/stac-geoparquet-spec/#referencing-a-stac-geoparquet-collections-in-a-stac-collection-json","title":"Referencing a STAC Geoparquet Collections in a STAC Collection JSON","text":"

A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an Asset Object at the collection level of the STAC JSON that includes the application/vnd.apache.parquet Media type and collection-mirror Role type to describe the function of the Geoparquet STAC Collection Asset.

For example:

Field Name Type Value href string s3://example/uri/to/file.parquet title string An example STAC GeoParquet. description string Example description. type string application/vnd.apache.parquet roles [string] [collection-mirror]*

*Note that the IANA has not yet approved the new Media type application/vnd.apache.parquet; it has been submitted for approval.

The description should ideally include details about the spatial partitioning method.

"},{"location":"spec/stac-geoparquet-spec/#mapping-to-other-geospatial-data-formats","title":"Mapping to other geospatial data formats","text":"

The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that.

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"STAC-geoparquet","text":"

Convert STAC items between JSON, GeoParquet, pgstac, and Delta Lake.

"},{"location":"#purpose","title":"Purpose","text":"

The STAC spec defines a JSON-based schema. But it can be hard to manage and search through many millions of STAC items in JSON format. For one, JSON is very large on disk. And you need to parse the entire JSON data into memory to extract just a small piece of information, say the datetime and one asset of an Item.

GeoParquet can be a good complement to JSON for many bulk-access and analytic use cases. While STAC Items are commonly distributed as individual JSON files on object storage or through a STAC API, STAC GeoParquet allows users to access a large number of STAC items in bulk without making repeated HTTP requests.

For analytic questions like \"find the items in the Sentinel-2 collection in June 2024 over New York City with cloud cover of less than 20%\" it can be much, much faster to find the relevant data from a GeoParquet source than from JSON, because GeoParquet needs to load only the relevant columns for that query, not the full data.

See the STAC-GeoParquet specification for details on the exact schema of the written Parquet files.

"},{"location":"#documentation","title":"Documentation","text":"

Documentation website

"},{"location":"drawbacks/","title":"Drawbacks","text":"

Trying to represent STAC data in GeoParquet has some drawbacks.

"},{"location":"drawbacks/#unable-to-represent-undefined-values","title":"Unable to represent undefined values","text":"

Parquet is unable to represent the difference between undefined and null, and so is unable to perfectly round-trip STAC data with undefined values.

In JSON a value can have one of three states: defined, undefined, or null. The \"b\" key in the next three examples illustrates this:

Defined:

{\n  \"a\": 1,\n  \"b\": \"foo\"\n}\n

Undefined:

{\n  \"a\": 2\n}\n

Null:

{\n  \"a\": 3,\n  \"b\": null\n}\n

Because Parquet is a columnar format, it is only able to represent undefined at the column level. So if those three JSON items above were converted to Parquet, the column \"b\" would exist because it exists in the first and third item, and the second item would have \"b\" inferred as null:

a b 1 \"foo\" 2 null 3 null

Then when the second item is converted back to JSON, it will be returned as

{\n  \"a\": 2\n  \"b\": null\n}\n

which is not strictly equal to the input.
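
This behavior is easy to demonstrate with pyarrow directly. The following sketch is purely illustrative (it is not part of the stac-geoparquet API) and converts the three JSON objects above to a table and back:

import pyarrow as pa\n\n# The three JSON objects from above; \"b\" is undefined in the second one.\nrows = [{\"a\": 1, \"b\": \"foo\"}, {\"a\": 2}, {\"a\": 3, \"b\": None}]\ntable = pa.Table.from_pylist(rows)\n\n# Round-tripping back to Python dicts fills the missing \"b\" with None (null).\nprint(table.to_pylist()[1])  # {'a': 2, 'b': None}\n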

"},{"location":"drawbacks/#schema-difficulties","title":"Schema difficulties","text":"

JSON is schemaless while Parquet requires a strict schema, and it can be very difficult to unite these two systems. This is such an important consideration that we have a documentation page just to discuss this point.

"},{"location":"schema/","title":"Schema considerations","text":"

A STAC Item is a JSON object to describe an external geospatial dataset. The STAC specification defines a common core, plus a variety of extensions. Additionally, STAC Items may include custom extensions outside the common ones. Crucially, the majority of the specified fields in the core spec and extensions define optional keys. Those keys often differ across STAC collections and may even differ within a single collection across items.

STAC's flexibility is a blessing and a curse. The flexibility of schemaless JSON allows for very easy writing as each object can be dumped separately to JSON. Every item is allowed to have a different schema. And newer items are free to have a different schema than older items in the same collection. But this write-time flexibility makes it harder to read as there are no guarantees (outside STAC's few required fields) about what fields exist.

Parquet is the complete opposite of JSON. Parquet has a strict schema that must be known before writing can start. This puts the burden of work onto the writer instead of the reader. Reading Parquet is very efficient because the file's metadata defines the exact schema of every record. This also enables use cases like reading specific columns that would not be possible without a strict schema.

This conversion from schemaless to strict-schema is the difficult part of converting STAC from JSON to GeoParquet, especially for large input datasets like STAC that are often larger than memory.

"},{"location":"schema/#full-scan-over-input-data","title":"Full scan over input data","text":"

The most foolproof way to convert STAC JSON to GeoParquet is to perform a full scan over input data. This is done automatically by parse_stac_ndjson_to_arrow when a schema is not provided.

This is time consuming as it requires two full passes over the input data: once to infer a common schema and again to actually write to Parquet (though items are never fully held in memory, allowing this process to scale).
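
As a minimal sketch (the path \"items.ndjson\" is a placeholder), letting the schema be inferred looks like this:

import stac_geoparquet\n\n# schema=None (the default) triggers a first pass for schema inference\n# and a second pass to read the data.\nreader = stac_geoparquet.arrow.parse_stac_ndjson_to_arrow(\"items.ndjson\")\ntable = reader.read_all()\n

If the inference pass is too slow, the limit parameter caps how many items are used for schema inference.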

"},{"location":"schema/#user-provided-schema","title":"User-provided schema","text":"

Alternatively, the user can pass in an Arrow schema using the schema parameter of parse_stac_ndjson_to_arrow. This schema must match the on-disk schema of the STAC JSON data.

"},{"location":"schema/#multiple-schemas-per-collection","title":"Multiple schemas per collection","text":"

It is also possible to write multiple Parquet files with STAC data where each Parquet file may have a different schema. This simplifies the conversion and writing process but makes reading and using the Parquet data harder.

"},{"location":"schema/#merging-data-with-schema-mismatch","title":"Merging data with schema mismatch","text":"

If you've created STAC GeoParquet data where the schema has changed over time, you can use pyarrow.concat_tables with promote_options=\"permissive\" to combine multiple STAC GeoParquet files.

import pyarrow as pa\nimport pyarrow.parquet as pq\n\ntable_1 = pq.read_table(\"stac1.parquet\")\ntable_2 = pq.read_table(\"stac2.parquet\")\ncombined_table = pa.concat_tables([table_1, table_2], promote_options=\"permissive\")\n
"},{"location":"schema/#future-work","title":"Future work","text":"

Schema operations are an area where future work can improve the reliability and ease of use of STAC GeoParquet.

It's possible that in the future we could automatically infer an Arrow schema from the STAC specification's published JSON Schema files. If you're interested in this, open an issue to discuss.

"},{"location":"usage/","title":"Usage","text":"

Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

All functionality that goes through Arrow is currently exported via the stac_geoparquet.arrow namespace.

"},{"location":"usage/#dictjson-arrow-conversion","title":"dict/JSON - Arrow conversion","text":""},{"location":"usage/#convert-dicts-to-arrow","title":"Convert dicts to Arrow","text":"

Use parse_stac_items_to_arrow to convert STAC items either in memory or on disk to a stream of Arrow record batches. This accepts either an iterable of Python dicts or an iterable of pystac.Item objects.

For example:

import pyarrow as pa\nimport pystac\n\nimport stac_geoparquet\n\nitem = pystac.read_file(\n    \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items/S2A_MSIL2A_20230112T104411_R008_T29NPE_20230113T053333\"\n)\nassert isinstance(item, pystac.Item)\n\nrecord_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow([item])\ntable = record_batch_reader.read_all()\n
"},{"location":"usage/#convert-json-to-arrow","title":"Convert JSON to Arrow","text":"

parse_stac_ndjson_to_arrow is a helper function to take one or more JSON or newline-delimited JSON files on disk, infer the schema from all of them, and convert the data to a stream of Arrow record batches.
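
A minimal sketch (the file name is a placeholder):

import stac_geoparquet\n\nrecord_batch_reader = stac_geoparquet.arrow.parse_stac_ndjson_to_arrow(\"sentinel-2-l2a.ndjson\")\nfor record_batch in record_batch_reader:\n    print(record_batch.num_rows)\n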

"},{"location":"usage/#convert-arrow-to-dicts","title":"Convert Arrow to dicts","text":"

Use stac_table_to_items to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.
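
For example, assuming \"items.parquet\" was written by this library:

import pyarrow.parquet as pq\n\nimport stac_geoparquet\n\ntable = pq.read_table(\"items.parquet\")\nfor item in stac_geoparquet.arrow.stac_table_to_items(table):\n    print(item[\"id\"])\n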

"},{"location":"usage/#convert-arrow-to-json","title":"Convert Arrow to JSON","text":"

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a newline-delimited JSON file. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.
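
For example, to convert a Parquet file back to newline-delimited JSON (both paths are placeholders):

import pyarrow.parquet as pq\n\nimport stac_geoparquet\n\ntable = pq.read_table(\"items.parquet\")\nstac_geoparquet.arrow.stac_table_to_ndjson(table, \"items.ndjson\")\n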

"},{"location":"usage/#parquet","title":"Parquet","text":"

Use to_parquet to write STAC Arrow data from memory to a path or file-like object. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

parse_stac_ndjson_to_parquet is a helper that connects reading (newline-delimited) JSON on disk to writing out to a Parquet file.

No special API is required for reading a STAC GeoParquet file back into Arrow. You can use pyarrow.parquet.read_table or pyarrow.parquet.ParquetFile directly to read the STAC GeoParquet data back into Arrow.
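
A minimal end-to-end sketch (both paths are placeholders):

import pyarrow.parquet as pq\n\nimport stac_geoparquet\n\n# Newline-delimited JSON in, GeoParquet out.\nstac_geoparquet.arrow.parse_stac_ndjson_to_parquet(\"items.ndjson\", \"items.parquet\")\n\n# Read the GeoParquet file back into Arrow with plain pyarrow.\ntable = pq.read_table(\"items.parquet\")\n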

"},{"location":"usage/#delta-lake","title":"Delta Lake","text":"

Use parse_stac_ndjson_to_delta_lake to read (newline-delimited) JSON on disk and write out to a Delta Lake table.

No special API is required for reading a STAC Delta Lake table back into Arrow. You can use the DeltaTable class directly to read the data back into Arrow.
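
For example (the paths are placeholders and the deltalake package is assumed to be installed):

from deltalake import DeltaTable\n\nimport stac_geoparquet\n\nstac_geoparquet.arrow.parse_stac_ndjson_to_delta_lake(\"items.ndjson\", \"stac-delta\")\n\n# Read the Delta Lake table back into Arrow.\ntable = DeltaTable(\"stac-delta\").to_pyarrow_table()\n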

Important

Arrow has a null data type, in which every value in the column is null, but Delta Lake does not.

_internal.SchemaMismatchError: Invalid data type for Delta Lake: Null\n

This is a problem because if a JSON key is null for every item in a STAC Collection, that column gets inferred as the Arrow null type. For example, the 3dep-lidar-copc collection used in the tests has start_datetime and end_datetime fields, so according to the spec its datetime is always null. This column would need to be cast to a timestamp type before being written to Delta Lake.

This means we cannot write this collection to Delta Lake solely with automatic schema inference.

In such cases, users may need to manually update the inferred schema to cast any null type to another Delta Lake-compatible type.
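
One possible approach, sketched here with plain pyarrow rather than a stac-geoparquet API, is to cast the offending column on an already-converted table before writing it to Delta Lake:

import pyarrow as pa\n\n# A tiny stand-in for a table whose \"datetime\" column was inferred as the null type.\ntable = pa.table({\"id\": [\"item-1\"], \"datetime\": pa.array([None], type=pa.null())})\n\nidx = table.schema.get_field_index(\"datetime\")\nfixed = table.schema.set(idx, pa.field(\"datetime\", pa.timestamp(\"us\", tz=\"UTC\")))\ntable = table.cast(fixed)\n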

"},{"location":"api/arrow/","title":"stac_geoparquet.arrow","text":"

Arrow-based format conversions.

"},{"location":"api/arrow/#stac_geoparquet.arrow","title":"stac_geoparquet.arrow","text":""},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_JSON_CHUNK_SIZE","title":"DEFAULT_JSON_CHUNK_SIZE module-attribute","text":"
DEFAULT_JSON_CHUNK_SIZE = 65536\n

The default chunk size to use for reading JSON into memory.

"},{"location":"api/arrow/#stac_geoparquet.arrow.DEFAULT_PARQUET_SCHEMA_VERSION","title":"DEFAULT_PARQUET_SCHEMA_VERSION module-attribute","text":"
DEFAULT_PARQUET_SCHEMA_VERSION: SUPPORTED_PARQUET_SCHEMA_VERSIONS = '1.1.0'\n

The default GeoParquet schema version written to file.

"},{"location":"api/arrow/#stac_geoparquet.arrow.SUPPORTED_PARQUET_SCHEMA_VERSIONS","title":"SUPPORTED_PARQUET_SCHEMA_VERSIONS module-attribute","text":"
SUPPORTED_PARQUET_SCHEMA_VERSIONS = Literal['1.0.0', '1.1.0']\n

A Literal type with the supported GeoParquet schema versions.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_items_to_arrow","title":"parse_stac_items_to_arrow","text":"
parse_stac_items_to_arrow(\n    items: Iterable[Item | dict[str, Any]],\n    *,\n    chunk_size: int = 8192,\n    schema: Schema | InferredSchema | None = None\n) -> RecordBatchReader\n

Parse a collection of STAC Items to a stream of pyarrow RecordBatches.

The objects under properties are moved up to the top-level of the Table, similar to geopandas.GeoDataFrame.from_features.

Parameters:

  • items (Iterable[Item | dict[str, Any]]) \u2013

    the STAC Items to convert

  • chunk_size (int, default: 8192 ) \u2013

    The chunk size to use for Arrow record batches. This only takes effect if schema is not None. When schema is None, the input will be parsed into a single contiguous record batch. Defaults to 8192.

  • schema (Schema | InferredSchema | None, default: None ) \u2013

    The schema of the input data. If provided, can improve memory use; otherwise all items need to be parsed into a single array for schema inference. Defaults to None.

Returns:

  • RecordBatchReader \u2013

    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow","title":"parse_stac_ndjson_to_arrow","text":"
parse_stac_ndjson_to_arrow(\n    path: str | Path | Iterable[str | Path],\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None\n) -> RecordBatchReader\n

Convert one or more newline-delimited JSON STAC files to a stream of Arrow RecordBatches.

Each RecordBatch in the returned iterator is guaranteed to have an identical schema, and can be used to write to one or more Parquet files.
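
For example, because every batch shares one schema, the stream can be written incrementally with a pyarrow.parquet.ParquetWriter. Note that this sketch writes plain Parquet without GeoParquet metadata (use to_parquet or parse_stac_ndjson_to_parquet for that), and the paths are placeholders:

import pyarrow.parquet as pq\n\nimport stac_geoparquet\n\nreader = stac_geoparquet.arrow.parse_stac_ndjson_to_arrow(\"items.ndjson\")\nwith pq.ParquetWriter(\"items_plain.parquet\", reader.schema) as writer:\n    for batch in reader:\n        writer.write_batch(batch)\n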

Parameters:

  • path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data.

Other Parameters:

  • limit (int | None) \u2013

    The maximum number of JSON Items to use for schema inference

Returns:

  • RecordBatchReader \u2013

    pyarrow RecordBatchReader with a stream of STAC Arrow RecordBatches.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_delta_lake","title":"parse_stac_ndjson_to_delta_lake","text":"
parse_stac_ndjson_to_delta_lake(\n    input_path: str | Path | Iterable[str | Path],\n    table_or_uri: str | Path | DeltaTable,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to Delta Lake

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • table_or_uri (str | Path | DeltaTable) \u2013

    A path to the output Delta Lake table

Other Parameters:

  • chunk_size (int, default: DEFAULT_JSON_CHUNK_SIZE ) \u2013

    The chunk size to use for reading JSON into memory. Defaults to 65536.

  • schema (Schema | None, default: None ) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None, default: None ) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS, default: DEFAULT_PARQUET_SCHEMA_VERSION ) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

"},{"location":"api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_parquet","title":"parse_stac_ndjson_to_parquet","text":"
parse_stac_ndjson_to_parquet(\n    input_path: str | Path | Iterable[str | Path],\n    output_path: str | Path,\n    *,\n    chunk_size: int = DEFAULT_JSON_CHUNK_SIZE,\n    schema: Schema | InferredSchema | None = None,\n    limit: int | None = None,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Convert one or more newline-delimited JSON STAC files to GeoParquet

Parameters:

  • input_path (str | Path | Iterable[str | Path]) \u2013

    One or more paths to files with STAC items.

  • output_path (str | Path) \u2013

    A path to the output Parquet file.

Other Parameters:

  • chunk_size (int) \u2013

    The chunk size. Defaults to 65536.

  • schema (Schema | InferredSchema | None) \u2013

    The schema to represent the input STAC data. Defaults to None, in which case the schema will first be inferred via a full pass over the input data. In this case, there will be two full passes over the input data: one to infer a common schema across all data and another to read the data and iteratively convert to GeoParquet.

  • limit (int | None) \u2013

    The maximum number of JSON records to convert.

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

All other keyword args are passed on to pyarrow.parquet.ParquetWriter.

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_items","title":"stac_table_to_items","text":"
stac_table_to_items(\n    table: Table | RecordBatchReader | ArrowStreamExportable,\n) -> Iterable[dict]\n

Convert STAC Arrow to a generator of STAC Item dicts.

Parameters:

  • table (Table | RecordBatchReader | ArrowStreamExportable) \u2013

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow RecordBatchReader, or any other Arrow stream object exposed through the Arrow PyCapsule Interface. A RecordBatchReader or stream object will not be materialized in memory.

Yields:

  • Iterable[dict] \u2013

    A STAC dict for each input row.

"},{"location":"api/arrow/#stac_geoparquet.arrow.stac_table_to_ndjson","title":"stac_table_to_ndjson","text":"
stac_table_to_ndjson(\n    table: Table | RecordBatchReader | ArrowStreamExportable,\n    dest: str | Path | PathLike[bytes],\n) -> None\n

Write STAC Arrow to a newline-delimited JSON file.

Parameters:

  • table (Table | RecordBatchReader | ArrowStreamExportable) \u2013

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow RecordBatchReader, or any other Arrow stream object exposed through the Arrow PyCapsule Interface. A RecordBatchReader or stream object will not be materialized in memory.

  • dest (str | Path | PathLike[bytes]) \u2013

    The destination where newline-delimited JSON should be written.

"},{"location":"api/arrow/#stac_geoparquet.arrow.to_parquet","title":"to_parquet","text":"
to_parquet(\n    table: Table | RecordBatchReader | ArrowStreamExportable,\n    output_path: str | Path,\n    *,\n    schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,\n    **kwargs: Any\n) -> None\n

Write an Arrow table with STAC data to GeoParquet

This writes metadata compliant with either GeoParquet 1.0 or 1.1.

Parameters:

  • table (Table | RecordBatchReader | ArrowStreamExportable) \u2013

    STAC in Arrow form. This can be a pyarrow Table, a pyarrow RecordBatchReader, or any other Arrow stream object exposed through the Arrow PyCapsule Interface. A RecordBatchReader or stream object will not be materialized in memory.

  • output_path (str | Path) \u2013

    The destination for saving.

Other Parameters:

  • schema_version (SUPPORTED_PARQUET_SCHEMA_VERSIONS) \u2013

    GeoParquet specification version; if not provided will default to latest supported version.

All other keyword args are passed on to pyarrow.parquet.ParquetWriter.

"},{"location":"api/legacy/","title":"Direct GeoPandas conversion (Legacy)","text":"

The API listed here was the initial non-Arrow-based STAC-GeoParquet implementation, converting between JSON and GeoPandas directly. For large collections of STAC items, using the new Arrow-based functionality (under the stac_geoparquet.arrow namespace) will be more performant.

Note that stac_geoparquet lifts the keys in the item properties up to the top level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.

>>> import requests\n>>> import stac_geoparquet.arrow\n>>> import pyarrow.parquet\n>>> import pyarrow as pa\n\n>>> items = requests.get(\n...     \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items\"\n... ).json()[\"features\"]\n>>> table = pa.Table.from_batches(stac_geoparquet.arrow.parse_stac_items_to_arrow(items))\n>>> stac_geoparquet.arrow.to_parquet(table, \"items.parquet\")\n>>> table2 = pyarrow.parquet.read_table(\"items.parquet\")\n>>> items2 = list(stac_geoparquet.arrow.stac_table_to_items(table2))\n
"},{"location":"api/legacy/#stac_geoparquet.to_geodataframe","title":"stac_geoparquet.to_geodataframe","text":"
to_geodataframe(\n    items: Sequence[dict[str, Any]],\n    add_self_link: bool = False,\n    dtype_backend: DTYPE_BACKEND | None = None,\n    datetime_precision: str = \"ns\",\n) -> GeoDataFrame\n

Convert a sequence of STAC items to a geopandas.GeoDataFrame.

The objects under properties are moved up to the top-level of the DataFrame, similar to geopandas.GeoDataFrame.from_features.
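
For example, fetching a small page of items from a public STAC API:

import requests\n\nimport stac_geoparquet\n\nitems = requests.get(\n    \"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items\"\n).json()[\"features\"]\ndf = stac_geoparquet.to_geodataframe(items, dtype_backend=\"pyarrow\")\n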

Parameters:

  • items (Sequence[dict[str, Any]]) \u2013

    A sequence of STAC items.

  • add_self_link (bool, default: False ) \u2013

    bool, default False Add the absolute link (if available) to the source STAC Item as a separate column named \"self_link\"

  • dtype_backend (DTYPE_BACKEND | None, default: None ) \u2013

    {'pyarrow', 'numpy_nullable'}, optional The dtype backend to use for storing arrays.

    By default, this will use 'numpy_nullable' and emit a FutureWarning that the default will change to 'pyarrow' in the next release.

    Set to 'numpy_nullable' to silence the warning and accept the old behavior.

    Set to 'pyarrow' to silence the warning and accept the new behavior.

    There are some differences in the output as well: with dtype_backend=\"pyarrow\", struct-like fields will explicitly contain null values for fields that appear in only some of the records. For example, given an assets field like:

    {\n    \"a\": {\n        \"href\": \"a.tif\",\n    },\n    \"b\": {\n        \"href\": \"b.tif\",\n        \"title\": \"B\",\n    }\n}\n

    The assets field of the output for the first row with dtype_backend=\"numpy_nullable\" will be a Python dictionary with just {\"href\": \"a.tif\"}.

    With dtype_backend=\"pyarrow\", this will be a pyarrow struct with fields {\"href\": \"a.tif\", \"title\": None}. pyarrow will infer that the struct field asset.title is nullable.

  • datetime_precision (str, default: 'ns' ) \u2013

    str, default \"ns\" The precision to use for the datetime columns. For example, \"us\" is microsecond and \"ns\" is nanosecond.

Returns:

  • GeoDataFrame \u2013

    The converted GeoDataFrame.

"},{"location":"api/legacy/#stac_geoparquet.to_item_collection","title":"stac_geoparquet.to_item_collection","text":"
to_item_collection(df: GeoDataFrame) -> ItemCollection\n

Convert a GeoDataFrame of STAC items to a pystac.ItemCollection.

Parameters:

  • df (GeoDataFrame) \u2013

    A GeoDataFrame with a schema similar to that exported by stac-geoparquet.

Returns:

  • ItemCollection \u2013

    The converted ItemCollection. There will be one record / feature per row in the GeoDataFrame.

"},{"location":"api/legacy/#stac_geoparquet.to_dict","title":"stac_geoparquet.to_dict","text":"
to_dict(record: dict) -> dict\n

Create a dictionary representing a STAC item from a row of the GeoDataFrame.

Parameters:

  • record (dict) \u2013

    dict

"},{"location":"api/pgstac/","title":"pgstac integration","text":"

stac_geoparquet.pgstac_reader has some helpers for working with items coming from a pgstac.items table (a usage sketch follows the list below). It takes care of

  • Rehydrating the dehydrated items
  • Partitioning by time
  • Injecting dynamic links and assets from a STAC API
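
For example, a hypothetical export of a single collection might look like the sketch below; every value shown is a placeholder and the storage options depend on your object store:

from stac_geoparquet.pgstac_reader import CollectionConfig\n\nconfig = CollectionConfig(collection_id=\"sentinel-2-l2a\")\nconfig.export_collection(\n    conninfo=\"postgresql://user:password@localhost/pgstac\",\n    output_protocol=\"s3\",\n    output_path=\"s3://example-bucket/stac-geoparquet/sentinel-2-l2a\",\n    storage_options={},\n)\n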
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig","title":"stac_geoparquet.pgstac_reader.CollectionConfig dataclass","text":"

Additional collection-based configuration to inject, matching the dynamic properties from the API.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection","title":"collection property","text":"
collection: Collection\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.collection_id","title":"collection_id instance-attribute","text":"
collection_id: str\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.partition_frequency","title":"partition_frequency class-attribute instance-attribute","text":"
partition_frequency: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.render_config","title":"render_config class-attribute instance-attribute","text":"
render_config: str | None = None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.should_inject_dynamic_properties","title":"should_inject_dynamic_properties class-attribute instance-attribute","text":"
should_inject_dynamic_properties: bool = True\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.stac_api","title":"stac_api class-attribute instance-attribute","text":"
stac_api: str = 'https://planetarycomputer.microsoft.com/api/stac/v1'\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__init__","title":"__init__","text":"
__init__(\n    collection_id: str,\n    partition_frequency: str | None = None,\n    stac_api: str = \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n    should_inject_dynamic_properties: bool = True,\n    render_config: str | None = None,\n) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.__post_init__","title":"__post_init__","text":"
__post_init__() -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_collection","title":"export_collection","text":"
export_collection(\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> list[str | None]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition","title":"export_partition","text":"
export_partition(\n    conninfo: str,\n    query: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any] | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.export_partition_for_endpoints","title":"export_partition_for_endpoints","text":"
export_partition_for_endpoints(\n    endpoints: tuple[datetime, datetime],\n    conninfo: str,\n    output_protocol: str,\n    output_path: str,\n    storage_options: dict[str, Any],\n    part_number: int | None = None,\n    total: int | None = None,\n    rewrite: bool = False,\n    skip_empty_partitions: bool = False,\n) -> str | None\n

Export results for a pair of endpoints.

"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.generate_endpoints","title":"generate_endpoints","text":"
generate_endpoints(\n    since: datetime | None = None,\n) -> list[tuple[datetime, datetime]]\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_assets","title":"inject_assets","text":"
inject_assets(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.inject_links","title":"inject_links","text":"
inject_links(item: dict[str, Any]) -> None\n
"},{"location":"api/pgstac/#stac_geoparquet.pgstac_reader.CollectionConfig.make_pgstac_items","title":"make_pgstac_items","text":"
make_pgstac_items(\n    records: list[tuple[str, str, str, datetime, datetime, dict[str, Any]]],\n    base_item: dict[str, Any],\n) -> list[dict[str, Any]]\n

Make STAC items out of pgstac records.

Parameters:

  • records (list[tuple[str, str, str, datetime, datetime, dict[str, Any]]]) \u2013

    list[tuple] The dehydrated records from pgstac.items table.

  • base_item (dict[str, Any]) \u2013

    dict[str, Any] The base item from the collection_base_item pgstac function for this collection. Used for rehydration

"},{"location":"spec/stac-geoparquet-spec/","title":"STAC GeoParquet Specification","text":""},{"location":"spec/stac-geoparquet-spec/#overview","title":"Overview","text":"

This document specifies how to map a set of STAC Items into GeoParquet. It is directly inspired by the STAC GeoParquet library, but aims to provide guidance for anyone putting STAC data into GeoParquet.

"},{"location":"spec/stac-geoparquet-spec/#use-cases","title":"Use cases","text":"
  • Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file.
  • As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON.
  • Provide efficient access to specific fields of a STAC item, thanks to Parquet's columnar format.
"},{"location":"spec/stac-geoparquet-spec/#guidelines","title":"Guidelines","text":"

Each row in the Parquet Dataset represents a single STAC item. Almost all of the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping from JSON into nested structures. We do pull the properties up to the top level so that it is easier to query and use them. The names of most of the fields should be the same in STAC and in GeoParquet.

Field GeoParquet Type Required Details type String Optional This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet stac_extensions List of Strings Required This column is required, but can be empty if no STAC extensions were used id String Required Required, should be unique within each collection geometry Binary (WKB) Required For GeoParquet 1.0 this must be well-known Binary bbox Struct of Floats Required Can be a 4 or 6 value struct, depending on dimension of the data. It must conform to the \"Bounding Box Columns\" definition of GeoParquet 1.1. links List of Link structs Required See Link Struct for more info assets An Assets struct Required See Asset Struct for more info collection String Optional The ID of the collection this Item is a part of. See notes below on 'Collection' and 'Collection JSON' in the Parquet metadata property columns varies - Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field
  • Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
  • It is strongly recommended to have only one GeoParquet file per STAC 'Collection'. Not doing so will lead to an expanded GeoParquet schema (the union of all the schemas in the collection) with lots of empty data
  • Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
  • STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
  • datetime columns should be stored as a native timestamp, not as a string
  • The Collection JSON should be included in the Parquet metadata. See Collection JSON below.
  • Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. proj:geometry) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.
"},{"location":"spec/stac-geoparquet-spec/#link-struct","title":"Link Struct","text":"

The GeoParquet dataset can contain zero or more Link Structs. Each Link Struct has 2 required fields and 2 optional ones:

Field Name Type Description href string REQUIRED. The actual link in the format of an URL. Relative and absolute links are both allowed. rel string REQUIRED. Relationship between the current document and the linked document. See chapter \"Relation types\" for more information. type string Media type of the referenced entity. title string A human readable title to be used in rendered displays of the link.

See Link Object for more.

"},{"location":"spec/stac-geoparquet-spec/#asset-struct","title":"Asset Struct","text":"

The GeoParquet dataset can contain zero or more Asset Structs. Each Asset Struct can have the following fields:

Field Name Type Description href string REQUIRED. URI to the asset object. Relative and absolute URI are both allowed. title string The displayed title for clients and users. description string A description of the Asset providing additional details, such as how it was processed or created. CommonMark 0.29 syntax MAY be used for rich text representation. type string Media type of the asset. See the common media types in the best practice doc for commonly used asset types. roles [string] The semantic roles of the asset, similar to the use of rel in links.

Each struct has each full asset key and object as a sub-struct; it is a direct mapping from the JSON to Parquet

To take advantage of Parquet's columnar nature and compression, the assets should be uniform so they can be represented by a simple schema, which in turn means every item should probably come from the same STAC collection.

See Asset Object for more.

"},{"location":"spec/stac-geoparquet-spec/#including-a-stac-collection-json-in-a-stac-geoparquet-collection","title":"Including a STAC Collection JSON in a STAC Geoparquet Collection","text":"

To make a stac-geoparquet file a fully self-contained representation, you can include the Collection JSON in the Parquet metadata. If present in the Parquet file metadata, the key must be stac:collection and the value must be a JSON string with the Collection JSON.
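
For example, reading that metadata back with plain pyarrow might look like this (the file name is a placeholder):

import json\n\nimport pyarrow.parquet as pq\n\nmetadata = pq.read_schema(\"items.parquet\").metadata or {}\nif b\"stac:collection\" in metadata:\n    collection = json.loads(metadata[b\"stac:collection\"])\n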

"},{"location":"spec/stac-geoparquet-spec/#referencing-a-stac-geoparquet-collections-in-a-stac-collection-json","title":"Referencing a STAC Geoparquet Collections in a STAC Collection JSON","text":"

A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an Asset Object at the collection level of the STAC JSON that includes the application/vnd.apache.parquet Media type and collection-mirror Role type to describe the function of the Geoparquet STAC Collection Asset.

For example:

Field Name Type Value href string s3://example/uri/to/file.parquet title string An example STAC GeoParquet. description string Example description. type string application/vnd.apache.parquet roles [string] [collection-mirror]*

*Note that the IANA has not yet approved the new Media type application/vnd.apache.parquet; it has been submitted for approval.

The description should ideally include details about the spatial partitioning method.

"},{"location":"spec/stac-geoparquet-spec/#mapping-to-other-geospatial-data-formats","title":"Mapping to other geospatial data formats","text":"

The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that.

"}]} \ No newline at end of file diff --git a/latest/sitemap.xml b/latest/sitemap.xml index c7f9c19..ddbc218 100644 --- a/latest/sitemap.xml +++ b/latest/sitemap.xml @@ -2,42 +2,42 @@ https://stac-utils.github.io/stac-geoparquet/latest/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/drawbacks/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/schema/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/usage/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/api/legacy/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/api/pgstac/ - 2024-06-24 + 2024-06-25 daily https://stac-utils.github.io/stac-geoparquet/latest/spec/stac-geoparquet-spec/ - 2024-06-24 + 2024-06-25 daily \ No newline at end of file diff --git a/latest/sitemap.xml.gz b/latest/sitemap.xml.gz index ff0ffba75439d5d6bfa656bfc6c1d004f0296c58..207add5b981efa0ac6084b502afcbb3a5081310a 100644 GIT binary patch literal 278 zcmV+x0qOo9iwFpS2zq7$|8r?{Wo=<_E_iKh0M(SiPQ)M(hVT0n4fjHA)Tn9Mn@^w* zFiS^DmKK(w)_r?vyD^PtJunyGpPxS;A=fGA0H909${5XS#VC=0CJ7psjKZt(?+Xa{}pgt%s?ClowVx9;z=7kL22nfN=`C(hspl cdyO8|?_Rlz|9*^hNZ-Ev1}tNkza|C%09hZ0G5`Po diff --git a/latest/usage/index.html b/latest/usage/index.html index d1a9546..2b0a3f1 100644 --- a/latest/usage/index.html +++ b/latest/usage/index.html @@ -733,19 +733,33 @@

Usage

-

Except for the legacy API, Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

+

Apache Arrow is used as the in-memory interchange format between all formats. While some end-to-end helper functions are provided, the user can go through Arrow objects for maximal flexibility in the conversion process.

All functionality that goes through Arrow is currently exported via the stac_geoparquet.arrow namespace.

dict/JSON - Arrow conversion

Convert dicts to Arrow

Use parse_stac_items_to_arrow to convert STAC items either in memory or on disk to a stream of Arrow record batches. This accepts either an iterable of Python dicts or an iterable of pystac.Item objects.

+

For example:

+
import pyarrow as pa
+import pystac
+
+import stac_geoparquet
+
+item = pystac.read_file(
+    "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items/S2A_MSIL2A_20230112T104411_R008_T29NPE_20230113T053333"
+)
+assert isinstance(item, pystac.Item)
+
+record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow([item])
+table = record_batch_reader.read_all()
+

Convert JSON to Arrow

parse_stac_ndjson_to_arrow is a helper function to take one or more JSON or newline-delimited JSON files on disk, infer the schema from all of them, and convert the data to a stream of Arrow record batches.

Convert Arrow to dicts

Use stac_table_to_items to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

Convert Arrow to JSON

-

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a generator of Python dicts. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

+

Use stac_table_to_ndjson to convert a table or stream of Arrow record batches of STAC data to a newline-delimited JSON file. This accepts either a pyarrow.Table or a pyarrow.RecordBatchReader, which allows conversions of larger-than-memory files in a streaming manner.

Parquet

-

Use to_parquet to write STAC Arrow data in memory. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

+

Use to_parquet to write STAC Arrow data from memory to a path or file-like object. This is a special function to ensure that GeoParquet 1.0 or 1.1 metadata is written to the Parquet file.

parse_stac_ndjson_to_parquet is a helper that connects reading (newline-delimited) JSON on disk to writing out to a Parquet file.

No special API is required for reading a STAC GeoParquet file back into Arrow. You can use pyarrow.parquet.read_table or pyarrow.parquet.ParquetFile directly to read the STAC GeoParquet data back into Arrow.

Delta Lake