Commit

Initial commit.
t-kataym committed Mar 2, 2021
1 parent f832e6a commit 069693f
Showing 39 changed files with 6,903 additions and 1 deletion.
18 changes: 18 additions & 0 deletions LICENSE.md
@@ -0,0 +1,18 @@
Copyright (C) 2021 Toshiba Corporation

Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies.

IN NO EVENT SHALL TOSHIBA CORPORATION BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF TOSHIBA CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

TOSHIBA CORPORATION SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND TOSHIBA CORPORATION HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.


This software is based on "parquet_fdw" by adjust GmbH, licensed under the terms below:

Copyright (c) 2018-2019, adjust GmbH

Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies.

IN NO EVENT SHALL THE adjust GmbH BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE adjust GmbH HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

THE adjust GmbH SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE adjust GmbH HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
48 changes: 48 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
MODULE_big = parquet_s3_fdw
OBJS = parquet_impl.o parquet_fdw.o
# Add file for S3
OBJS += parquet_s3_fdw.o parquet_s3_fdw_connection.o parquet_s3_fdw_server_option.o

PGFILEDESC = "parquet_s3_fdw - foreign data wrapper for parquet on S3"

SHLIB_LINK = -lm -lstdc++ -lparquet -larrow
# Add libraries for S3
SHLIB_LINK += -laws-cpp-sdk-core -laws-cpp-sdk-s3

EXTENSION = parquet_s3_fdw
DATA = parquet_s3_fdw--0.1.sql parquet_s3_fdw--0.1--0.2.sql

REGRESS = parquet_fdw import parquet_s3_fdw import_s3 parquet_s3_fdw2

EXTRA_CLEAN = sql/parquet_fdw.sql expected/parquet_fdw.out

PG_CONFIG ?= pg_config

# parquet_impl.cpp requires C++ 11.
override PG_CXXFLAGS += -std=c++11 -O3

PGXS := $(shell $(PG_CONFIG) --pgxs)

# pass CCFLAGS (when defined) to both C and C++ compilers.
ifdef CCFLAGS
override PG_CXXFLAGS += $(CCFLAGS)
override PG_CFLAGS += $(CCFLAGS)
endif

include $(PGXS)

# XXX: PostgreSQL below 11 does not automatically add -fPIC or equivalent to C++
# flags when building a shared library, so we have to do it here explicitly.
ifeq ($(shell test $(VERSION_NUM) -lt 110000; echo $$?), 0)
override CXXFLAGS += $(CFLAGS_SL)
endif

# PostgreSQL uses a link-time optimization option which may break compilation
# (this happens on travis-ci). Redefine COMPILE.cxx.bc without this option.
COMPILE.cxx.bc = $(CLANG) -xc++ -Wno-ignored-attributes $(BITCODE_CXXFLAGS) $(CPPFLAGS) -emit-llvm -c

# XXX: a workaround to use common compiler flags when building bytecode from C++
# files. This should not be necessary, but src/Makefile.global omits passing those
# flags for an unknown reason.
%.bc : %.cpp
$(COMPILE.cxx.bc) $(CXXFLAGS) $(CPPFLAGS) -o $@ $<
185 changes: 184 additions & 1 deletion README.md
@@ -1 +1,184 @@
# parquet_s3_fdw
# Parquet S3 Foreign Data Wrapper for PostgreSQL

This PostgreSQL extension is a Foreign Data Wrapper (FDW) for accessing Parquet files on a local file system and [Amazon S3][2].
This version of parquet_s3_fdw works with PostgreSQL 13.

Parquet foreign data wrapper supporting S3 access for PostgreSQL.

This code is based on [`parquet_fdw`][1] created by adjust GmbH.

## Installation
### 1. Install dependent libraries
`parquet_s3_fdw` requires `libarrow` and `libparquet` to be installed on your system (version 0.15; for previous versions use the [arrow-0.14](https://github.com/adjust/parquet_fdw/tree/arrow-0.14) branch). Please refer to the [building guide](https://github.com/apache/arrow/blob/master/cpp/README.md).

The `AWS SDK for C++` (`libaws-cpp-sdk-core`, `libaws-cpp-sdk-s3`) is also required (confirmed with version 1.8.14).

Attention!
We recommend building `libarrow`, `libparquet`, and the `AWS SDK for C++` from source. Linking failed for us with pre-compiled binaries because Arrow and the AWS SDK were built with different gcc versions.

### 2. Build and install parquet_s3_fdw
```sh
make install
```
or, if PostgreSQL is installed in a custom location:
```sh
make install PG_CONFIG=/path/to/pg_config
```
It is possible to pass additional compilation flags through either the custom
`CCFLAGS` variable or the standard `PG_CFLAGS`, `PG_CXXFLAGS`, and `PG_CPPFLAGS` variables.

## Usage
### Load extension
```sql
CREATE EXTENSION parquet_s3_fdw;
```

### Create server
```sql
CREATE SERVER parquet_s3_srv FOREIGN DATA WRAPPER parquet_s3_fdw;
```
If you use [MinIO][3] instead of AWS S3, pass the `use_minio` option to `CREATE SERVER`:
```sql
CREATE SERVER parquet_s3_srv FOREIGN DATA WRAPPER parquet_s3_fdw OPTIONS (use_minio 'true');
```

### Create user mapping
You have to specify a user name and password when accessing Amazon S3.
```sql
CREATE USER MAPPING FOR public SERVER parquet_s3_srv OPTIONS (user 's3user', password 's3password');
```
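
If the credentials change later, the mapping can be updated in place with standard `ALTER USER MAPPING` syntax (the values here are placeholders):
```sql
ALTER USER MAPPING FOR public SERVER parquet_s3_srv
OPTIONS (SET user 's3newuser', SET password 's3newpassword');
```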

### Create foreign table
Now you should be able to create a foreign table from Parquet files. Currently `parquet_s3_fdw` supports the following column [types](https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h) (to be extended shortly):

| Parquet type | SQL type |
|--------------|-----------|
| INT32 | INT4 |
| INT64 | INT8 |
| FLOAT | FLOAT4 |
| DOUBLE | FLOAT8 |
| TIMESTAMP | TIMESTAMP |
| DATE32 | DATE |
| STRING | TEXT |
| BINARY | BYTEA |
| LIST | ARRAY |

Currently `parquet_s3_fdw` doesn't support structs and nested lists.

The following options are supported:
* **filename** - a space-separated list of paths to the Parquet files to read. Paths on AWS S3 must start with `s3://`. Mixing local paths and S3 paths is not supported;
* **dirname** - path to a directory containing the Parquet files to read (see the second example below);
* **sorted** - a space-separated list of columns that the Parquet files are presorted by; this helps PostgreSQL avoid redundant sorting when running a query with an `ORDER BY` clause, or in other cases when a presorted set is beneficial (Group Aggregate, Merge Join);
* **use_mmap** - whether memory-map operations will be used instead of file read operations (default `false`);
* **use_threads** - enables `arrow`'s parallel column decoding/decompression (default `false`).

GUC variables:
* **parquet_fdw.use_threads** - global switch that allows users to enable or disable threads (default `true`).
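
For example, to turn off multi-threaded decoding for the current session:
```sql
SET parquet_fdw.use_threads = false;
```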

Example:
```sql
CREATE FOREIGN TABLE userdata (
id int,
first_name text,
last_name text
)
SERVER parquet_s3_srv
OPTIONS (
filename 's3://bucket/dir/userdata1.parquet'
);
```
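
A variant of the same table that reads every Parquet file in a directory and is declared as presorted by `id` (bucket, path, and column names are illustrative):
```sql
CREATE FOREIGN TABLE userdata_dir (
    id int,
    first_name text,
    last_name text
)
SERVER parquet_s3_srv
OPTIONS (
    dirname 's3://bucket/dir',
    sorted 'id'
);
```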

### Access foreign table
```sql
SELECT * FROM userdata;
```
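
To verify that the query is served by the wrapper, inspect the plan; the exact output varies across PostgreSQL versions, but it should contain a `Foreign Scan` node on `userdata`:
```sql
EXPLAIN SELECT * FROM userdata;
```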

## Parallel queries
`parquet_s3_fdw` also supports [parallel query execution](https://www.postgresql.org/docs/current/parallel-query.html) (not to be confused with `arrow`'s multi-threaded decoding feature). It is disabled by default; to enable it, run the `ANALYZE` command on the table. The reason is that without statistics PostgreSQL may choose a terrible parallel plan for certain queries, one much worse than a serial plan (e.g. grouping by a column with a large number of distinct values).
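
A minimal sequence to let the planner consider a parallel scan might look like this (the worker count is illustrative, and whether a parallel plan is actually chosen depends on your data and settings):
```sql
ANALYZE userdata;
SET max_parallel_workers_per_gather = 4;
EXPLAIN SELECT count(*) FROM userdata;
```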

## Import
`parquet_s3_fdw` also supports the [`IMPORT FOREIGN SCHEMA`](https://www.postgresql.org/docs/current/sql-importforeignschema.html) command, which discovers Parquet files in the specified filesystem directory and creates foreign tables according to those files. It can be used as follows:

```sql
IMPORT FOREIGN SCHEMA "/path/to/directory"
FROM SERVER parquet_s3_srv
INTO public;
```

Note that `remote_schema` here is a path to a local filesystem directory and must be double-quoted.
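
Table options described above can be passed to the imported tables through an `OPTIONS` clause; for instance (directory path illustrative):
```sql
IMPORT FOREIGN SCHEMA "/path/to/directory"
FROM SERVER parquet_s3_srv
INTO public
OPTIONS (sorted 'one');
```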

Another way to import parquet files into foreign tables is to use `import_parquet_s3` or `import_parquet_s3_explicit`:

```sql
CREATE FUNCTION import_parquet_s3(
tablename text,
schemaname text,
servername text,
userfunc regproc,
args jsonb,
options jsonb)

CREATE FUNCTION import_parquet_s3_explicit(
tablename text,
schemaname text,
servername text,
attnames text[],
atttypes regtype[],
userfunc regproc,
args jsonb,
options jsonb)
```

The only difference between `import_parquet_s3` and `import_parquet_s3_explicit` is that the latter allows you to specify the set of attributes (columns) to import. `attnames` and `atttypes` are the arrays of attribute names and attribute types, respectively (see the example below).

`userfunc` is a user-defined function. It must take a `jsonb` argument and return a text array of filesystem paths to the Parquet files to be imported. `args` is a user-specified `jsonb` object that is passed to `userfunc` as its argument. A simple implementation of such a function and its usage may look like this:

```sql
CREATE FUNCTION list_parquet_s3_files(args jsonb)
RETURNS text[] AS
$$
BEGIN
    -- Collect the paths of all *.parquet files in the given directory.
    RETURN (SELECT array_agg(args->>'dir' || '/' || filename)
            FROM pg_ls_dir(args->>'dir') AS files(filename)
            WHERE filename ~~ '%.parquet');
END
$$
LANGUAGE plpgsql;

SELECT import_parquet_s3_explicit(
    'abc',
    'public',
    'parquet_s3_srv',
    array['one', 'three', 'six'],
    array['int8', 'text', 'bool']::regtype[],
    'list_parquet_s3_files',
    '{"dir": "/path/to/directory"}',
    '{"sorted": "one"}'
);
```
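
The non-explicit variant takes the same arguments minus the attribute arrays and infers the columns from the files themselves; a call mirroring the regression tests might look like this (directory path illustrative):
```sql
SELECT import_parquet_s3(
    'example_import',
    'public',
    'parquet_s3_srv',
    'list_parquet_s3_files',
    '{"dir": "/path/to/directory"}',
    '{"sorted": "one"}'
);
```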

## Features
- Supports `SELECT` over Parquet files on a local file system or Amazon S3.
- Supports MinIO access instead of Amazon S3.

## Limitations
- Modification (INSERT, UPDATE, and DELETE) is not supported.
- Transactions are not supported.
- A single foreign table cannot read Parquet files from both the local file system and Amazon S3.
- The AWS region is hard-coded as "ap-northeast-1". To use another region, you need to modify the source code by changing "AP_NORTHEAST_1" in parquet_s3_fdw_connection.cpp.

## Contributing
Opening issues and pull requests on GitHub is welcome.

## License
Copyright (c) 2021, TOSHIBA Corporation
Copyright (c) 2018 - 2019, adjust GmbH

Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies.

See the [`LICENSE.md`][4] file for full details.

[1]: https://github.com/adjust/parquet_fdw
[2]: https://aws.amazon.com/s3/
[3]: https://min.io/
[4]: LICENSE.md
21 changes: 21 additions & 0 deletions data/README.md
@@ -0,0 +1,21 @@
# Sample Parquet data

`example.parquet` schema:

| column | type |
|--------|-------------|
| one | INT64 |
| two | LIST<INT64> |
| three | STRING |
| four | TIMESTAMP |
| five | DATE32 |
| six | BOOL |

## Generator

The generator script requires the `pyarrow` and `pandas` Python modules. To
generate the Parquet files, run:

```sh
python generate.py
```
Binary file added data/example1.parquet
Binary file added data/example2.parquet
60 changes: 60 additions & 0 deletions data/generate.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime, date

# row group 1
df1 = pd.DataFrame({'one': [1, 2, 3],
'two': [[1, 2, 3], [None, 5, 6], [7, 8, 9]],
'three': ['foo', 'bar', 'baz'],
'four': [datetime(2018, 1, 1),
datetime(2018, 1, 2),
datetime(2018, 1, 3)],
'five': [date(2018, 1, 1),
date(2018, 1, 2),
date(2018, 1, 3)],
'six': [True, False, True],
'seven': [0.5, None, 1.0]})
table1 = pa.Table.from_pandas(df1)

# row group 2
df2 = pd.DataFrame({'one': [4, 5, 6],
'two': [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
'three': ['uno', 'dos', 'tres'],
'four': [datetime(2018, 1, 4),
datetime(2018, 1, 5),
datetime(2018, 1, 6)],
'five': [date(2018, 1, 4),
date(2018, 1, 5),
date(2018, 1, 6)],
'six': [False, False, False],
'seven': [0.5, None, 1.0]})
table2 = pa.Table.from_pandas(df2)

with pq.ParquetWriter('example1.parquet', table1.schema) as writer:
writer.write_table(table1)
writer.write_table(table2)

# example2.parquet file
df3 = pd.DataFrame({'one': [1, 3, 5, 7, 9],
'two': [[19, 20], [21, 22], [23, 24], [25, 26], [27, 28]],
'three': ['eins', 'zwei', 'drei', 'vier', 'fünf'],
'four': [datetime(2018, 1, 1),
datetime(2018, 1, 3),
datetime(2018, 1, 5),
datetime(2018, 1, 7),
datetime(2018, 1, 9)],
'five': [date(2018, 1, 1),
date(2018, 1, 3),
date(2018, 1, 5),
date(2018, 1, 7),
date(2018, 1, 9)],
'six': [True, False, True, False, True]})
table3 = pa.Table.from_pandas(df3)

with pq.ParquetWriter('example2.parquet', table3.schema) as writer:
writer.write_table(table3)
2 changes: 2 additions & 0 deletions expected/.gitignore
@@ -0,0 +1,2 @@
*
!.gitignore
33 changes: 33 additions & 0 deletions input/import.source
@@ -0,0 +1,33 @@
SET datestyle = 'ISO';
SET client_min_messages = WARNING;
SET log_statement TO 'none';
CREATE EXTENSION parquet_s3_fdw;
DROP ROLE IF EXISTS regress_parquet_s3_fdw;
CREATE ROLE regress_parquet_s3_fdw LOGIN SUPERUSER;
CREATE SERVER parquet_s3_srv FOREIGN DATA WRAPPER parquet_s3_fdw;
CREATE USER MAPPING FOR regress_parquet_s3_fdw SERVER parquet_s3_srv;
SET ROLE regress_parquet_s3_fdw;

-- import foreign schema
IMPORT FOREIGN SCHEMA "@abs_srcdir@/data"
FROM SERVER parquet_s3_srv
INTO public
OPTIONS (sorted 'one');
\d
SELECT * FROM example2;

-- import_parquet
create function list_parquet_s3_files(args jsonb)
returns text[] as
$$
select array[args->>'dir' || '/example1.parquet', args->>'dir' || '/example2.parquet']::text[];
$$
language sql;

select import_parquet_s3('example_import', 'public', 'parquet_s3_srv', 'list_parquet_s3_files', '{"dir": "@abs_srcdir@/data"}', '{"sorted": "one"}');
SELECT * FROM example_import ORDER BY one, three;
select import_parquet_s3_explicit('example_import2', 'public', 'parquet_s3_srv', array['one', 'three', 'six'], array['int8', 'text', 'bool']::regtype[], 'list_parquet_s3_files', '{"dir": "@abs_srcdir@/data"}', '{"sorted": "one"}');
SELECT * FROM example_import2;

DROP FUNCTION list_parquet_s3_files;
DROP EXTENSION parquet_s3_fdw CASCADE;