From 8818ef1c991d664f0947af4d1d58cfe10ede3aea Mon Sep 17 00:00:00 2001 From: Jason Petersen Date: Thu, 26 Sep 2024 10:42:09 -0600 Subject: [PATCH] Add make_view_incremental, tests, documentation, etc. (#28) Committing this functional subset to get the PR open and start working code into a release. --- META.json | 5 ++- Makefile | 4 +- Trunk.toml | 2 +- doc/guide.md | 2 +- doc/reference.md | 24 +++++++++++ sql/timeseries.sql | 77 +++++++++++++++++++++++++++++++++++ test/expected/basic_usage.out | 65 +++++++++++++++++++++++++++++ test/sql/basic_usage.sql | 31 ++++++++++++++ timeseries.control | 4 +- 9 files changed, 206 insertions(+), 8 deletions(-) diff --git a/META.json b/META.json index 8c62f3b..d7fe081 100644 --- a/META.json +++ b/META.json @@ -2,7 +2,7 @@ "name": "timeseries", "abstract": "Convenience API for Tembo time series stack", "description": "Convenience API for Tembo time series stack", - "version": "0.1.5", + "version": "0.1.6", "maintainer": [ "Jason Petersen " ], @@ -12,7 +12,7 @@ "abstract": "Convenience API for Tembo time series stack", "file": "sql/timeseries.sql", "docfile": "doc/timeseries.md", - "version": "0.1.5" + "version": "0.1.6" } }, "prereqs": { @@ -23,6 +23,7 @@ "recommends": { "columnar": "1.1.2", "pg_cron": "1.6.2", + "pg_ivm": "1.9.0", "pg_partman": "4.5.1" } } diff --git a/Makefile b/Makefile index 42b6602..b8d6985 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ EXTVERSION = $(shell grep -m 1 '[[:space:]]\{6\}"version":' META.json | \ DISTVERSION = $(shell grep -m 1 '[[:space:]]\{3\}"version":' META.json | \ sed -e 's/[[:space:]]*"version":[[:space:]]*"\([^"]*\)",\{0,1\}/\1/') -EXTVERSIONS = 0.1.5 +EXTVERSIONS = 0.1.6 DATA = $(wildcard sql/*--*.sql) DATA_built = $(foreach v,$(EXTVERSIONS),sql/$(EXTENSION)--$(v).sql) @@ -30,5 +30,5 @@ latest-changes.md: Changes # generate each version's file installation file by concatenating # previous upgrade scripts -sql/$(EXTENSION)--0.1.5.sql: sql/$(EXTENSION).sql +sql/$(EXTENSION)--0.1.6.sql: sql/$(EXTENSION).sql cat $^ > $@ diff --git a/Trunk.toml b/Trunk.toml index 1bb0bb1..6844e4c 100644 --- a/Trunk.toml +++ b/Trunk.toml @@ -6,7 +6,7 @@ description = "Open source timeseries extension for Postgres." homepage = "https://github.com/tembo-io/pg_timeseries" documentation = "https://github.com/tembo-io/pg_timeseries" categories = ["analytics"] -version = "0.1.5" +version = "0.1.6" [build] postgres_version = "15" diff --git a/doc/guide.md b/doc/guide.md index 729d65a..223f6ad 100644 --- a/doc/guide.md +++ b/doc/guide.md @@ -4,7 +4,7 @@ In this guide, you will become familiar with the functions and features of the t ## Preparing your database -You'll need a PostgreSQL instance running timeseries `0.1.5` or later. An easy way to have one set up for you is to deploy one from Tembo Cloud [here](https://cloud.tembo.io). The free tier will perform well enough for the data set we'll be using. +You'll need a PostgreSQL instance running timeseries `0.1.6` or later. An easy way to have one set up for you is to deploy one from Tembo Cloud [here](https://cloud.tembo.io). The free tier will perform well enough for the data set we'll be using. Once that's up and running, you'll need a client machine with `psql` (to connect to your database) and [the Divvy dataset](https://tembo-demo-bucket.s3.amazonaws.com/202004--202402-divvy-tripdata-slim.csv), which will total about 50MiB of CSV after decompression. diff --git a/doc/reference.md b/doc/reference.md index 7a7a958..f5e47e8 100644 --- a/doc/reference.md +++ b/doc/reference.md @@ -132,6 +132,30 @@ Clears the compression policy for a table (so its data will never be dropped). `interval`, the previous policy for this table, or `NULL` if none was set. +## Incremental Views + +These functions deal with the creation and maintenance of incremental views, or more verbosely, _incrementally maintainable materialized views_ (IMMVs). The underlying functionality is provided the the `pg_ivm` ("PostgreSQL Incremental View Maintenance") project, which handles applying changes toward any partial aggregates and groups which are affected by incoming `INSERT`, `UPDATE`, `DELETE`, etc. (DML) statements. + +Within `pg_timeseries`, it is expected that these functions be applied against existing time-series enhanced tables. + +### `make_view_incremental` + +This function takes an existing view—typically used to calculate aggregates across a time-series data set—and makes it into an incremental view: one whose aggregates are stored in a materialized fashion, but which is kept up-to-date as every individual modification to the underlying table is processed. + +After this function exits, the same view will exist, but will now point at an _incrementally maintainable materialized view_. This view maintains its correctness through a series of statement triggers on the time-series table, meaning that for maximum performance, some degree of row batching should be used for ingest. + +#### Arguments + + * `target_view_id` (`regclass`), **required** — a view aggregating a single existing time-series table + +#### Considerations + +Views which reference more than one table are presently forbidden, though this restriction may be lifted at a later date. + +#### Returns + +`void`. If any problems are encountered, an error will be raised. + ## Analytics Functions These functions are not related to the maintenance of time-series tables, but do sometimes rely on the related metadata to function. They are intended to make time-series queries easier to read and maintain. diff --git a/sql/timeseries.sql b/sql/timeseries.sql index 1eba835..b8911dc 100644 --- a/sql/timeseries.sql +++ b/sql/timeseries.sql @@ -613,3 +613,80 @@ CREATE AGGREGATE locf(anyelement) ( SFUNC = locf_agg, STYPE = anyelement ); + +-- A wrapper around the incremental view functionality provided by pg_ivm. This +-- function accepts a pointer to an existing view (presumably one that queries +-- a time-series table) and rewrites that view to point at an underlying IMMV +-- instead. The purpose of this is to avoid exposing users to implementation- +-- specific columns used to track partial aggregate state within IMMV tables: +-- these columns may confuse users or interfere with software which expects +-- all visible columns to be meaningful to an application (say, dashboards). +-- +-- For the record, IMMV means "incrementally maintainable materialized view", +-- though at the moment these objects are actually plain tables. They are +-- created by calling functions from the pg_ivm project ("PostgreSQL incre- +-- mental view maintenance"). After creation, these views automatically keep +-- aggregates up-to-date with the latest data written to any referenced tables. +-- This is an improvement over plain views (which reaggregate on every scan) +-- and materialized views (which require periodic calls to REFRESH). +-- +-- Within this wrapper, the provided view's SQL is extracted, then a new IMMV +-- is created using the extracted query. Because this IMMV will have at least +-- one (sometimes two or more) internal column per aggregate (beginning with +-- the prefix '__ivm_'), it is desirable to allow the user to query a wrapper +-- view instead. So the original view's query is redefined to point at the +-- IMMV table, but restricted solely to the visible columns of the original +-- query. Big picture: the user can create an aggregating view, check it +-- quickly (with a LIMIT, for instance), then make it incremental once they +-- have verified it's producing the aggregate rows they want. +CREATE OR REPLACE FUNCTION public.make_view_incremental(target_view_id regclass) + RETURNS void + LANGUAGE plpgsql +AS $function$ +DECLARE + orig_view_sql text; + immv_name text; + columns_sql text; + old_client_msg text; + old_log_msg text; +BEGIN + -- check that target_view_id is actually a view + -- check that target_view_id mentions only one table + + -- get the original SQL definition of the view to be upgraded + SELECT pg_get_viewdef(target_view_id) INTO orig_view_sql; + + -- build the name we'll use for the new immv + SELECT format('%I.%I', n.nspname, c.relname || '_immv') + INTO immv_name + FROM pg_class c + LEFT JOIN pg_namespace n + ON n.oid = c.relnamespace + WHERE c.oid=target_view_id; + + -- pg_ivm is chatty, even at NOTICE, so turn it down + SELECT current_setting('client_min_messages') INTO old_client_msg; + SELECT current_setting('log_min_messages') INTO old_log_msg; + SET LOCAL client_min_messages TO warning; + SET LOCAL log_min_messages TO warning; + + -- create an immv with the original view's SQL + PERFORM create_immv(immv_name, orig_view_sql); + + -- restore previous client/log levels + EXECUTE format('SET LOCAL client_min_messages TO %s', old_client_msg); + EXECUTE format('SET LOCAL log_min_messages TO %s', old_log_msg); + + -- build a list of visible columns for the view + SELECT string_agg(quote_ident(attname), ', ' ORDER BY attnum ASC) + INTO columns_sql + FROM pg_attribute + WHERE attnum > 0 AND + NOT attisdropped AND + attrelid=target_view_id; + + -- modify the existing view to point at the immv, hiding internal columns + EXECUTE format('CREATE OR REPLACE VIEW %I AS SELECT %s FROM %s', + target_view_id, columns_sql, immv_name); +END; +$function$; diff --git a/test/expected/basic_usage.out b/test/expected/basic_usage.out index 15b1e31..cfdb6d5 100644 --- a/test/expected/basic_usage.out +++ b/test/expected/basic_usage.out @@ -238,3 +238,68 @@ ORDER BY 3; 2 | 1.9000000000000001 | Wed Nov 04 15:59:00 2020 PST (9 rows) +CREATE VIEW events_5m AS + SELECT + user_id, + date_bin('5 minutes', + event_time, + TIMESTAMPTZ '1970-01-01') AS event_time, + max(value), + min(value) + FROM events + GROUP BY 1, 2; +CREATE VIEW events_totals AS + SELECT + user_id, + sum(value), + count(user_id) + FROM events + GROUP BY 1; +SELECT make_view_incremental('events_5m'); + make_view_incremental +----------------------- + +(1 row) + +SELECT make_view_incremental('events_totals'); + make_view_incremental +----------------------- + +(1 row) + +SELECT * FROM events_5m ORDER BY 1, 2; + user_id | event_time | max | min +---------+------------------------------+-----+----- + 1 | Wed Nov 04 15:50:00 2020 PST | 1.2 | 1.1 + 1 | Wed Nov 04 15:55:00 2020 PST | 1.6 | 1.3 + 2 | Wed Nov 04 15:50:00 2020 PST | 1.8 | 1.7 + 2 | Wed Nov 04 15:55:00 2020 PST | 2.2 | 1.9 +(4 rows) + +SELECT * FROM events_totals ORDER BY 1; + user_id | sum | count +---------+------+------- + 1 | 8.1 | 6 + 2 | 11.7 | 6 +(2 rows) + +INSERT INTO events VALUES (3, 1, '2020-11-04 15:51:02.226999-08', 1.1); +DELETE FROM events WHERE event_id = 12; +SELECT * FROM events_5m ORDER BY 1, 2; + user_id | event_time | max | min +---------+------------------------------+-----+----- + 1 | Wed Nov 04 15:50:00 2020 PST | 1.2 | 1.1 + 1 | Wed Nov 04 15:55:00 2020 PST | 1.6 | 1.3 + 2 | Wed Nov 04 15:50:00 2020 PST | 1.8 | 1.7 + 2 | Wed Nov 04 15:55:00 2020 PST | 2.1 | 1.9 + 3 | Wed Nov 04 15:50:00 2020 PST | 1.1 | 1.1 +(5 rows) + +SELECT * FROM events_totals ORDER BY 1; + user_id | sum | count +---------+-----+------- + 1 | 8.1 | 6 + 2 | 9.5 | 5 + 3 | 1.1 | 1 +(3 rows) + diff --git a/test/sql/basic_usage.sql b/test/sql/basic_usage.sql index d7f4643..8bfb104 100644 --- a/test/sql/basic_usage.sql +++ b/test/sql/basic_usage.sql @@ -99,3 +99,34 @@ FROM date_bin_table(NULL::events, '1 minute', '(2020-11-04 15:50:00-08, 2020-11-04 16:00:00-08)') GROUP BY 3 ORDER BY 3; + +CREATE VIEW events_5m AS + SELECT + user_id, + date_bin('5 minutes', + event_time, + TIMESTAMPTZ '1970-01-01') AS event_time, + max(value), + min(value) + FROM events + GROUP BY 1, 2; + +CREATE VIEW events_totals AS + SELECT + user_id, + sum(value), + count(user_id) + FROM events + GROUP BY 1; + +SELECT make_view_incremental('events_5m'); +SELECT make_view_incremental('events_totals'); + +SELECT * FROM events_5m ORDER BY 1, 2; +SELECT * FROM events_totals ORDER BY 1; + +INSERT INTO events VALUES (3, 1, '2020-11-04 15:51:02.226999-08', 1.1); +DELETE FROM events WHERE event_id = 12; + +SELECT * FROM events_5m ORDER BY 1, 2; +SELECT * FROM events_totals ORDER BY 1; diff --git a/timeseries.control b/timeseries.control index 1aa7027..8c87f27 100644 --- a/timeseries.control +++ b/timeseries.control @@ -1,5 +1,5 @@ # time series extension comment = 'Convenience API for Tembo time series stack' -default_version = '0.1.5' +default_version = '0.1.6' module_pathname = '$libdir/timeseries' -requires = 'columnar, pg_cron, pg_partman' +requires = 'columnar, pg_cron, pg_ivm, pg_partman'