From aff0bcb4fe9e22427c2a70800997197d696ec6bb Mon Sep 17 00:00:00 2001 From: asr2003 <162500856+asr2003@users.noreply.github.com> Date: Mon, 21 Oct 2024 05:43:59 +0530 Subject: [PATCH 1/2] Add Documentation for Filtering Search Results and Partial Indices in Vector Search (#159) * docs: add filtering and partial index examples to vector search documentation * docs: add note for partial indices * update columns --- docs/api/search.md | 43 +++++++++++++++++++- extension/sql/example.sql | 84 ++++++++++++++++++++------------------- 2 files changed, 84 insertions(+), 43 deletions(-) diff --git a/docs/api/search.md b/docs/api/search.md index 60c73be..2a4abcb 100644 --- a/docs/api/search.md +++ b/docs/api/search.md @@ -76,8 +76,11 @@ select vectorize.table( ## Search a table -Search a table initialized with `vectorize.table`. The search results are sorted in descending order according to similarity. - The `query` is transformed to embeddings using the same `transformer` configured during `vectorize.table`. +Search a table initialized with `vectorize.table`. The search results are sorted in descending order according to similarity. + +The `query` is transformed to embeddings using the same `transformer` configured during `vectorize.table`. + +The `where_sql` parameter is used to apply additional filtering to the search results based on SQL conditions. ```sql vectorize."search"( @@ -86,6 +89,7 @@ vectorize."search"( "api_key" TEXT DEFAULT NULL, "return_columns" TEXT[] DEFAULT ARRAY['*']::text[], "num_results" INT DEFAULT 10 + "where_sql" TEXT DEFAULT NULL ) RETURNS TABLE ( "search_results" jsonb ) @@ -100,6 +104,7 @@ vectorize."search"( | api_key | text | API key for the specified chat model. If OpenAI, this value overrides the config `vectorize.openai_key` | | return_columns | text[] | The columns to return in the search results. Defaults to all columns. | | num_results | int | The number of results to return. Sorted in descending order according to similarity. 
Defaults to 10. | +| where_sql | text | An optional SQL condition to filter the search results. This condition is applied after the similarity search. | ### Example @@ -122,3 +127,37 @@ SELECT * FROM vectorize.search( {"product_id": 4, "product_name": "Bluetooth Speaker", "similarity_score": 0.8250355616233103} (3 rows) ``` + +## Filtering Search Results + +The `where_sql` parameter allows you to apply SQL-based filtering after performing the vector similarity search. This feature is useful when you want to narrow down the search results based on certain conditions such as `product_category` or `price`. + +### Example + +```sql +SELECT * FROM vectorize.search( + job_name => 'product_search', + query => 'mobile electronic devices', + return_columns => ARRAY['product_id', 'product_name'], + num_results => 3, + where_sql => 'product_category = ''electronics'' AND price > 100' +); +``` + +In the above example, the results are filtered where the `product_category` is `electronics` and the `price` is greater than 100. + +## Optimizing Searches with Partial Indices + +To improve performance when using filters, you can create partial indices. This will speed up the execution of queries with frequent conditions in the `where_sql` parameter. + +### Example + +```sql +CREATE INDEX idx_product_price ON products (product_name) WHERE price > 100; +``` + +This index optimizes queries that search for products where the `price` is greater than 100. + +> **Note:** Partial indices improve performance by only indexing rows that meet the specified condition. This reduces the amount of data the database needs to scan, making queries with the same filter more efficient since only relevant rows are included in the index. + +By combining the `where_sql` filtering feature with partial indices, you can efficiently narrow down search results and improve query performance. 
diff --git a/extension/sql/example.sql b/extension/sql/example.sql index ec1823c..c4fb4d3 100644 --- a/extension/sql/example.sql +++ b/extension/sql/example.sql @@ -2,48 +2,50 @@ CREATE TABLE example_products ( product_id SERIAL PRIMARY KEY, product_name TEXT NOT NULL, description TEXT, + product_category TEXT NOT NULL, + price DECIMAL(10, 2) NOT NULL, last_updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); -INSERT INTO example_products(product_name, description, last_updated_at) VALUES -('Pencil', 'Utensil used for writing and often works best on paper', NOW()), -('Laptop Stand', 'Elevated platform for laptops, enhancing ergonomics', NOW()), -('Desk Lamp', 'Illumination device for workspaces, often adjustable', NOW()), -('Bluetooth Speaker', 'Portable audio device with wireless connectivity', NOW()), -('Water Bottle', 'Reusable container for liquids, often insulated', NOW()), -('Backpack', 'Storage solution for carrying personal items on one’s back', NOW()), -('Wireless Mouse', 'Pointing device without the need for a physical connection', NOW()), -('Plant Pot', 'Container for holding plants, often with drainage', NOW()), -('Sunglasses', 'Protective eyewear to shield eyes from UV rays', NOW()), -('Notebook', 'Bound sheets of paper for note-taking or sketching', NOW()), -('Stylus Pen', 'Tool for touchscreen devices, mimics finger touch', NOW()), -('Travel Mug', 'Insulated container for beverages on-the-go', NOW()), -('Phone Charger', 'Device to replenish the battery of mobile phones', NOW()), -('Yoga Mat', 'Cushioned surface for practicing yoga or exercise', NOW()), -('Wall Clock', 'Time-telling device meant to hang on walls', NOW()), -('Keychain', 'Small device for holding keys together', NOW()), -('Desk Organizer', 'Tool for sorting and storing desk items', NOW()), -('Earbuds', 'Small headphones that fit directly inside the ear', NOW()), -('Calendar', 'Physical representation of days and months, often used for scheduling', NOW()), -('Umbrella', 
'Protective gear against rain or intense sun', NOW()), -('Hand Sanitizer', 'Liquid or gel used to decrease infectious agents on hands', NOW()), -('Sketchbook', 'Paper-filled book used for drawing or painting', NOW()), -('Flash Drive', 'Portable storage device for digital files', NOW()), -('Tablet Holder', 'Stand or grip for holding tablets or e-readers', NOW()), -('Shampoo', 'Hair care product designed to cleanse the scalp and hair', NOW()), -('Wristwatch', 'Time-telling device worn around the wrist', NOW()), -('Basketball', 'Spherical sporting equipment used in basketball games', NOW()), -('Guitar Picks', 'Small flat tool used to strum or pick a guitar', NOW()), -('Thermal Flask', 'Insulated bottle for keeping beverages hot or cold', NOW()), -('Slippers', 'Soft and light footwear intended for indoor use', NOW()), -('Easel', 'Upright support for artists to display or work on canvases', NOW()), -('Bicycle Helmet', 'Protective headgear for cyclists', NOW()), -('Candle Holder', 'Accessory to safely hold candles when they burn', NOW()), -('Cutting Board', 'Durable board on which to place materials for cutting', NOW()), -('Gardening Gloves', 'Handwear for protection during gardening tasks', NOW()), -('Alarm Clock', 'Time-telling device with a feature to sound at a specified time', NOW()), -('Spatula', 'Flat tool used in cooking for flipping or spreading', NOW()), -('Jigsaw Puzzle', 'Picture printed on cardboard or wood and cut into pieces to be reassembled', NOW()), -('Hammock', 'Sling made of fabric or netting, suspended between two points for relaxation', NOW()), -('Luggage Tag', 'Accessory attached to luggage for identification purposes', NOW()) +INSERT INTO example_products(product_name, description, product_category, price, last_updated_at) VALUES +('Pencil', 'Utensil used for writing and often works best on paper', 'stationery', 1.50, NOW()), +('Laptop Stand', 'Elevated platform for laptops, enhancing ergonomics', 'electronics', 35.99, NOW()), +('Desk Lamp', 
'Illumination device for workspaces, often adjustable', 'furniture', 22.50, NOW()), +('Bluetooth Speaker', 'Portable audio device with wireless connectivity', 'electronics', 99.99, NOW()), +('Water Bottle', 'Reusable container for liquids, often insulated', 'kitchenware', 15.00, NOW()), +('Backpack', 'Storage solution for carrying personal items on one’s back', 'accessories', 45.00, NOW()), +('Wireless Mouse', 'Pointing device without the need for a physical connection', 'electronics', 25.00, NOW()), +('Plant Pot', 'Container for holding plants, often with drainage', 'garden', 12.00, NOW()), +('Sunglasses', 'Protective eyewear to shield eyes from UV rays', 'accessories', 50.00, NOW()), +('Notebook', 'Bound sheets of paper for note-taking or sketching', 'stationery', 3.99, NOW()), +('Stylus Pen', 'Tool for touchscreen devices, mimics finger touch', 'electronics', 18.50, NOW()), +('Travel Mug', 'Insulated container for beverages on-the-go', 'kitchenware', 10.99, NOW()), +('Phone Charger', 'Device to replenish the battery of mobile phones', 'electronics', 20.00, NOW()), +('Yoga Mat', 'Cushioned surface for practicing yoga or exercise', 'sports', 30.00, NOW()), +('Wall Clock', 'Time-telling device meant to hang on walls', 'furniture', 15.50, NOW()), +('Keychain', 'Small device for holding keys together', 'accessories', 5.00, NOW()), +('Desk Organizer', 'Tool for sorting and storing desk items', 'furniture', 12.50, NOW()), +('Earbuds', 'Small headphones that fit directly inside the ear', 'electronics', 49.99, NOW()), +('Calendar', 'Physical representation of days and months, often used for scheduling', 'stationery', 10.00, NOW()), +('Umbrella', 'Protective gear against rain or intense sun', 'accessories', 8.99, NOW()), +('Hand Sanitizer', 'Liquid or gel used to decrease infectious agents on hands', 'personal care', 2.50, NOW()), +('Sketchbook', 'Paper-filled book used for drawing or painting', 'stationery', 6.99, NOW()), +('Flash Drive', 'Portable storage device for 
digital files', 'electronics', 12.00, NOW()), +('Tablet Holder', 'Stand or grip for holding tablets or e-readers', 'electronics', 22.99, NOW()), +('Shampoo', 'Hair care product designed to cleanse the scalp and hair', 'personal care', 7.50, NOW()), +('Wristwatch', 'Time-telling device worn around the wrist', 'accessories', 120.00, NOW()), +('Basketball', 'Spherical sporting equipment used in basketball games', 'sports', 20.00, NOW()), +('Guitar Picks', 'Small flat tool used to strum or pick a guitar', 'music', 5.00, NOW()), +('Thermal Flask', 'Insulated bottle for keeping beverages hot or cold', 'kitchenware', 18.99, NOW()), +('Slippers', 'Soft and light footwear intended for indoor use', 'footwear', 10.00, NOW()), +('Easel', 'Upright support for artists to display or work on canvases', 'art supplies', 45.00, NOW()), +('Bicycle Helmet', 'Protective headgear for cyclists', 'sports', 35.00, NOW()), +('Candle Holder', 'Accessory to safely hold candles when they burn', 'home decor', 15.00, NOW()), +('Cutting Board', 'Durable board on which to place materials for cutting', 'kitchenware', 10.50, NOW()), +('Gardening Gloves', 'Handwear for protection during gardening tasks', 'garden', 8.00, NOW()), +('Alarm Clock', 'Time-telling device with a feature to sound at a specified time', 'electronics', 25.00, NOW()), +('Spatula', 'Flat tool used in cooking for flipping or spreading', 'kitchenware', 3.99, NOW()), +('Jigsaw Puzzle', 'Picture printed on cardboard or wood and cut into pieces to be reassembled', 'toys', 12.99, NOW()), +('Hammock', 'Sling made of fabric or netting, suspended between two points for relaxation', 'outdoor', 40.00, NOW()), +('Luggage Tag', 'Accessory attached to luggage for identification purposes', 'travel', 7.50, NOW()) ; From 638b12887f14d47de0793b16d535b226d8f371b9 Mon Sep 17 00:00:00 2001 From: Adam Hendel Date: Tue, 22 Oct 2024 13:08:05 +0300 Subject: [PATCH 2/2] update test for new example table (#165) * update test for new example table * add 
missing migration sql * update versions * pg16 --- .github/workflows/extension_upgrade.yml | 2 +- README.md | 4 ++-- docs/examples/scheduling.md | 4 ++-- extension/Cargo.toml | 2 +- extension/Trunk.toml | 2 +- extension/sql/vectorize--0.18.3--0.19.0.sql | 0 extension/sql/vectorize--0.19.0--0.19.1.sql | 2 ++ extension/tests/integration_tests.rs | 16 ++++++++-------- 8 files changed, 17 insertions(+), 15 deletions(-) create mode 100644 extension/sql/vectorize--0.18.3--0.19.0.sql create mode 100644 extension/sql/vectorize--0.19.0--0.19.1.sql diff --git a/.github/workflows/extension_upgrade.yml b/.github/workflows/extension_upgrade.yml index 479a65e..48f9461 100644 --- a/.github/workflows/extension_upgrade.yml +++ b/.github/workflows/extension_upgrade.yml @@ -16,7 +16,7 @@ on: jobs: test: name: Upgrade Test - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 services: vector-serve: image: quay.io/tembo/vector-serve:latest diff --git a/README.md b/README.md index de6f9e8..26f3d30 100644 --- a/README.md +++ b/README.md @@ -244,8 +244,8 @@ Alternatively, `schedule => 'realtime` creates triggers on the source table and Statements below would will result in new embeddings being generated either immediately (`schedule => 'realtime'`) or within the cron schedule set in the `schedule` parameter. 
```sql -INSERT INTO products (product_id, product_name, description) -VALUES (12345, 'pizza', 'dish of Italian origin consisting of a flattened disk of bread'); +INSERT INTO products (product_id, product_name, description, product_category, price) +VALUES (12345, 'pizza', 'dish of Italian origin consisting of a flattened disk of bread', 'food', 5.99); UPDATE products SET description = 'sling made of fabric, rope, or netting, suspended between two or more points, used for swinging, sleeping, or resting' diff --git a/docs/examples/scheduling.md b/docs/examples/scheduling.md index b77873c..b964bb0 100644 --- a/docs/examples/scheduling.md +++ b/docs/examples/scheduling.md @@ -9,8 +9,8 @@ Alternatively, `schedule => 'realtime` creates triggers on the source table and Statements below would will result in new embeddings being generated either immediately (`schedule => 'realtime'`) or within the cron schedule set in the `schedule` parameter. ```sql -INSERT INTO products (product_id, product_name, description) -VALUES (12345, 'pizza', 'dish of Italian origin consisting of a flattened disk of bread'); +INSERT INTO products (product_id, product_name, description, product_category, price) +VALUES (12345, 'pizza', 'dish of Italian origin consisting of a flattened disk of bread', 'food', 5.99); UPDATE products SET description = 'sling made of fabric, rope, or netting, suspended between two or more points, used for swinging, sleeping, or resting' diff --git a/extension/Cargo.toml b/extension/Cargo.toml index 3a2e328..cc2bcfa 100644 --- a/extension/Cargo.toml +++ b/extension/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vectorize" -version = "0.19.0" +version = "0.19.1" edition = "2021" publish = false diff --git a/extension/Trunk.toml b/extension/Trunk.toml index 164d00d..472d5d5 100644 --- a/extension/Trunk.toml +++ b/extension/Trunk.toml @@ -6,7 +6,7 @@ description = "The simplest way to orchestrate vector search on Postgres." 
homepage = "https://github.com/tembo-io/pg_vectorize" documentation = "https://github.com/tembo-io/pg_vectorize" categories = ["orchestration", "machine_learning"] -version = "0.19.0" +version = "0.19.1" loadable_libraries = [{ library_name = "vectorize", requires_restart = true }] [build] diff --git a/extension/sql/vectorize--0.18.3--0.19.0.sql b/extension/sql/vectorize--0.18.3--0.19.0.sql new file mode 100644 index 0000000..e69de29 diff --git a/extension/sql/vectorize--0.19.0--0.19.1.sql b/extension/sql/vectorize--0.19.0--0.19.1.sql new file mode 100644 index 0000000..56cf6dd --- /dev/null +++ b/extension/sql/vectorize--0.19.0--0.19.1.sql @@ -0,0 +1,2 @@ +ALTER TABLE vectorize.example_products ADD COLUMN product_category TEXT NOT NULL; +ALTER TABLE vectorize.example_products ADD COLUMN price DECIMAL(10, 2) NOT NULL; \ No newline at end of file diff --git a/extension/tests/integration_tests.rs b/extension/tests/integration_tests.rs index 02cb904..6361682 100644 --- a/extension/tests/integration_tests.rs +++ b/extension/tests/integration_tests.rs @@ -166,8 +166,8 @@ async fn test_realtime_job() { let random_product_id = rng.gen_range(0..100000); let insert_query = format!( - "INSERT INTO \"{test_table_name}\"(product_id, product_name, description) - VALUES ({random_product_id}, 'car tester', $$a product for testing car's components$$);" + "INSERT INTO \"{test_table_name}\"(product_id, product_name, description, product_category, price) + VALUES ({random_product_id}, 'car tester', $$a product for testing car's components$$, 'electronics', 10.99);" ); // insert a new row @@ -198,8 +198,8 @@ async fn test_realtime_job() { let random_product_id = rng.gen_range(0..100000); let insert_query = format!( - "INSERT INTO \"{test_table_name}\"(product_id, product_name, description) - VALUES ({random_product_id}, 'messy-product', $DELIM$the $$quick brown fox jump's over the lazy dog$DELIM$);" + "INSERT INTO \"{test_table_name}\"(product_id, product_name, description, 
product_category, price) + VALUES ({random_product_id}, 'messy-product', $DELIM$the $$quick brown fox jump's over the lazy dog$DELIM$, 'product', 10.99);" ); // insert a new row @@ -332,8 +332,8 @@ async fn test_static() { let random_product_id = rng.gen_range(1..100000); let insert_query = format!( - "INSERT INTO \"{test_table_name}\"(product_id, product_name, description) - VALUES ({random_product_id}, 'car tester', 'a product for testing cars');" + "INSERT INTO \"{test_table_name}\"(product_id, product_name, description, product_category, price) + VALUES ({random_product_id}, 'car tester', 'a product for testing cars', 'electronics', 10.99);" ); // insert a new row @@ -424,8 +424,8 @@ async fn test_realtime_tabled() { // insert a new row let insert_query = format!( - "INSERT INTO \"{test_table_name}\"(product_id, product_name, description) - VALUES ({random_product_id}, 'car tester', 'a product for testing cars');" + "INSERT INTO \"{test_table_name}\"(product_id, product_name, description, product_category, price) + VALUES ({random_product_id}, 'car tester', 'a product for testing cars', 'electronics', 10.99);" ); let _result = sqlx::query(&insert_query) .execute(&conn)