From 75b90397ea44d0c2be0a8c00ac3ee791bc6bb6a3 Mon Sep 17 00:00:00 2001 From: analyzer1 Date: Mon, 23 Sep 2024 13:21:29 -0400 Subject: [PATCH] Added hashing function and deprecated PGCRYPTO. (#9) --- extension/Cargo.toml | 2 + extension/pg_auto_dw.control | 1 - .../src/controller/bgw_transformer_client.rs | 9 +- extension/src/controller/dv_loader.rs | 25 +- extension/src/lib.rs | 11 + extension/src/model/mod.rs | 3 +- extension/src/model/prompt_template.rs | 314 +++++++++++++++++ extension/src/utility/ollama_client.rs | 318 +----------------- extension/src/utility/openai_client.rs | 316 +---------------- 9 files changed, 346 insertions(+), 653 deletions(-) create mode 100644 extension/src/model/prompt_template.rs diff --git a/extension/Cargo.toml b/extension/Cargo.toml index 7c9f990..799281e 100644 --- a/extension/Cargo.toml +++ b/extension/Cargo.toml @@ -26,6 +26,8 @@ uuid = { version = "1.1", features = ["v4", "v5", "serde"] } chrono = { version = "0.4", features = ["serde"] } anyhow = "1.0" regex = "1.7" +sha2 = "0.10" +hex = "0.4" [dev-dependencies] pgrx-tests = "=0.11.4" diff --git a/extension/pg_auto_dw.control b/extension/pg_auto_dw.control index 911d3d6..949a5dc 100644 --- a/extension/pg_auto_dw.control +++ b/extension/pg_auto_dw.control @@ -4,4 +4,3 @@ module_pathname = '$libdir/pg_auto_dw' relocatable = false superuser = true schema = 'auto_dw' -requires = 'pgcrypto' diff --git a/extension/src/controller/bgw_transformer_client.rs b/extension/src/controller/bgw_transformer_client.rs index bba1de7..339c61e 100644 --- a/extension/src/controller/bgw_transformer_client.rs +++ b/extension/src/controller/bgw_transformer_client.rs @@ -6,8 +6,7 @@ use std::collections::HashMap; use tokio::runtime::Runtime; use serde::Deserialize; -use crate::queries; -use crate::model::source_objects; +use crate::model::*; // use crate::utility::ollama_client; use crate::utility::openai_client; use crate::utility::guc; @@ -75,7 +74,7 @@ pub extern "C" fn 
background_worker_transformer_client(_arg: pg_sys::Datum) { while retries < MAX_TRANSFORMER_RETRIES { runtime.block_on(async { // Get Generation - generation_json_bk_identification = match openai_client::send_request(table_details_json_str.as_str(), openai_client::PromptTemplate::BKIdentification, &0, &hints).await { + generation_json_bk_identification = match openai_client::send_request(table_details_json_str.as_str(), prompt_template::PromptTemplate::BKIdentification, &0, &hints).await { Ok(response_json) => { // TODO: Add a function to enable logging. @@ -123,7 +122,7 @@ pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { while retries < MAX_TRANSFORMER_RETRIES { runtime.block_on(async { // Get Generation - generation_json_bk_name = match openai_client::send_request(table_details_json_str.as_str(), openai_client::PromptTemplate::BKName, &0, &hints).await { + generation_json_bk_name = match openai_client::send_request(table_details_json_str.as_str(), prompt_template::PromptTemplate::BKName, &0, &hints).await { Ok(response_json) => { // let response_json_pretty = serde_json::to_string_pretty(&response_json) @@ -174,7 +173,7 @@ pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { generation_json_descriptor_sensitive = match openai_client::send_request( table_details_json_str.as_str(), - openai_client::PromptTemplate::DescriptorSensitive, + prompt_template::PromptTemplate::DescriptorSensitive, column, &hints).await { Ok(response_json) => { diff --git a/extension/src/controller/dv_loader.rs b/extension/src/controller/dv_loader.rs index f87e717..c2a8edc 100644 --- a/extension/src/controller/dv_loader.rs +++ b/extension/src/controller/dv_loader.rs @@ -118,14 +118,14 @@ fn dv_data_loader_hub_dml (dv_schema: &DVSchema) -> String { FROM {}.hub_{} ) SELECT - ENCODE(PUBLIC.DIGEST(ARRAY_TO_STRING(ARRAY[-1], ',')::TEXT, 'sha256'), 'hex') AS hub_{}_hk, + auto_dw.hash(ARRAY_TO_STRING(ARRAY[-1], ',')::TEXT) AS hub_{}_hk, 
'0001-01-01'::TIMESTAMP WITHOUT TIME ZONE AS load_ts, 'SYSTEM'::TEXT AS record_source {} FROM initialized WHERE NOT initialized.is_initialized UNION SELECT - ENCODE(PUBLIC.DIGEST(ARRAY_TO_STRING(ARRAY[-2], ',')::TEXT, 'sha256'), 'hex') AS hub_{}_hk, + auto_dw.hash(ARRAY_TO_STRING(ARRAY[-2], ',')::TEXT) AS hub_{}_hk, '0001-01-01'::TIMESTAMP WITHOUT TIME ZONE AS load_ts, 'SYSTEM'::TEXT AS record_source {} @@ -168,10 +168,9 @@ fn dv_data_loader_hub_dml (dv_schema: &DVSchema) -> String { WITH stg_data AS ( SELECT - ENCODE( - public.DIGEST( - ARRAY_TO_STRING( - ARRAY[{}], ','), 'sha256'), 'hex') AS hub_{}_hk, + auto_dw.hash( + ARRAY_TO_STRING(ARRAY[{}], ',') + ) AS hub_{}_hk, (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts, '{}' AS record_source{} FROM {}.{} AS stg @@ -330,14 +329,12 @@ fn dv_data_loader_sat_dml (dv_schema: &DVSchema) -> String { WITH stg AS ( SELECT *, - ENCODE( - {source_schema_name}.DIGEST( - ARRAY_TO_STRING( - ARRAY[{hub_bk_parts_sql_stg_array}], ','), 'sha256'), 'hex') AS hub_{business_key_name}_hk, - ENCODE( - {source_schema_name}.DIGEST( - ARRAY_TO_STRING( - ARRAY[{sat_source_sql_array}], ','), 'sha256'), 'hex') AS sat_{key}_hd + auto_dw.hash( + ARRAY_TO_STRING(ARRAY[{hub_bk_parts_sql_stg_array}], ',') + ) AS hub_{business_key_name}_hk, + auto_dw.hash( + ARRAY_TO_STRING(ARRAY[{sat_source_sql_array}], ',') + ) AS sat_{key}_hd FROM {source_schema_name}.{source_table_name} AS stg ), new_stg_data AS ( diff --git a/extension/src/lib.rs b/extension/src/lib.rs index 7755d79..7e8241d 100644 --- a/extension/src/lib.rs +++ b/extension/src/lib.rs @@ -5,6 +5,9 @@ mod utility; // Initialization, Configuration Management, and External Servic pub use pgrx::prelude::*; use uuid::Uuid; +use sha2::{Sha256, Digest}; +use hex; + pgrx::pg_module_magic!(); use model::queries; @@ -134,6 +137,14 @@ fn source_column() -> Result< .map(TableIterator::new) } +#[pg_extern] +fn hash(inputs: &str) -> String { + let mut hasher = Sha256::new(); + 
hasher.update(inputs.as_bytes()); + let digest = hasher.finalize(); + hex::encode(digest) +} + #[cfg(any(test, feature = "pg_test"))] #[pg_schema] mod tests { diff --git a/extension/src/model/mod.rs b/extension/src/model/mod.rs index 7c9c8c9..e811788 100644 --- a/extension/src/model/mod.rs +++ b/extension/src/model/mod.rs @@ -1,3 +1,4 @@ pub mod source_objects; pub mod dv_schema; -pub mod queries; \ No newline at end of file +pub mod queries; +pub mod prompt_template; \ No newline at end of file diff --git a/extension/src/model/prompt_template.rs b/extension/src/model/prompt_template.rs new file mode 100644 index 0000000..c0ac173 --- /dev/null +++ b/extension/src/model/prompt_template.rs @@ -0,0 +1,314 @@ +#[derive(Debug)] +pub enum PromptTemplate { + BKIdentification, + BKName, + DescriptorSensitive, +} + +impl PromptTemplate { + pub fn template(&self) -> &str { + match self { + PromptTemplate::BKIdentification => r#" + Task Title: Business Key Identification in JSON Source Table Object + + You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. + + Requested Task: Identify the column number most likely to serve as the business key. Return only one column in JSON format as specified below. + + + Request Details: + If the column is a primary key, assume it is the business key. If not, choose the column most likely to uniquely identify the table’s entity. Additionally, provide a confidence value for your selection. + + Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in the selected column. A value of 0.80 or higher is considered reasonably confident. + + + Reason: Indicate why you made the decision you did. + + Output: Ensure the output conforms to the format shown in the examples below. 
+ + Example Input 1) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Example Output 1) + { + "Identified Business Key": { + "Column No": 1, + "Confidence Value": 0.95, + "Reason": "The 'customer_id' column is designated as the primary key, which is typically the best candidate for a business key." + } + } + + Example Input 2) + JSON Source Table Object: + { + "Schema Name": "sales", + "Table Name": "order_details", + "Column Details": [ + "Column No: 1 Named: order_id of type: integer Column Comments: NA", + "Column No: 2 Named: product_id of type: integer Column Comments: NA", + "Column No: 3 Named: quantity of type: integer Column Comments: NA", + "Column No: 4 Named: order_date of type: date Column Comments: NA" + ] + } + + Example Output 2) + { + "Identified Business Key": { + "Column No": 1, + "Confidence Value": 0.75, + "Reason": "Although 'order_id' is not explicitly marked as a primary key, it is likely to uniquely identify each order, making it a strong candidate for the business key." + } + } + + Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} + + JSON Source Table Object: {new_json} + "#, + PromptTemplate::BKName => r#" + Task Title: Business Key Naming in JSON Source Table Object with specified Column + + You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. + + Requested Task: Identify the business key name. 
The business key part column has already been identified, and its associated column number, “column no”, will be provided along with the JSON Source Table Object. Return a name that best represents the business key from a data vault perspective. + + Request Details: + + The Business Key Name should be crafted based on the attribute linked to the business key, as identified by the provided column number. Prioritize the attribute name over the table name if the attribute name is descriptive enough. It should clearly represent the core business entity, avoiding generic terms like “ID,” “number,” or “Entity.” The name should focus solely on the business aspect, using terms like “customer,” “employee,” or “seller” that directly reflect the entity’s purpose, without unnecessary suffixes or identifiers. If the attribute associated with the business key or its column comments are not descriptive enough, the table name or schema name can be used to help formulate the Business Key Name. + + Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your chosen Business Key Name. A value of 0.80 or higher is considered reasonably confident. + + + Reason: Indicate why you made the decision you did. + + Output: Ensure the output conforms to the format shown in the examples below. + + Example Input 1) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. 
Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Column No: 1 + + Example Output 1) + { + "Business Key Name": { + "Name": "Customer", + "Confidence Value": 0.9, + "Reason": "The column 'customer_id' is a primary key and represents the unique identifier for customers in the 'customer' table. Given that the table name 'customer' directly reflects the business entity, 'Customer' is chosen as the Business Key Name. The confidence value is high because the identifier is straightforward and strongly aligned with the core business entity." + } + } + + Example Input 2) + JSON Source Table Object: + { + "Schema Name": "sales", + "Table Name": "order_details", + "Column Details": [ + "Column No: 1 Named: id of type: integer Column Comments: NA", + "Column No: 2 Named: product_id of type: integer Column Comments: NA", + "Column No: 3 Named: quantity of type: integer Column Comments: NA", + "Column No: 4 Named: order_date of type: date Column Comments: NA" + ] + } + + Column No: 1 + + Example Output 2) + { + "Business Key Name": { + "Name": "Order", + "Confidence Value": 0.85, + "Reason": "The column 'id' is a primary key and serves as the unique identifier for records in the 'order_details' table. Although the column name 'id' is generic, the table name 'order_details' indicates that the records pertain to individual orders. Therefore, 'Order' is chosen as the Business Key Name to best represent the core business entity. The confidence value is slightly lower due to the generic nature of the column name, but it is still reasonably confident given the context provided by the table name." + } + } + + Now, based on the instructions and examples above, please generate the JSON output for the following input. 
{hints} + + JSON Source Table Object: {new_json} + + Column No: {column_no} + "#, + PromptTemplate::DescriptorSensitive => r#" + Task Title: Identification of PII in JSON Source Table Object + + You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to assist in the creation of downstream data vault tables by performing the requested tasks based on this information. + + Requested Task: Identify if the descriptor is a descriptor sensitive PII subtype. A descriptor column, along with its associated column number (“column no”), will be provided in the JSON Source Table Object. If you determine that the column contains Personally Identifiable Information (PII), categorize it as “Descriptor - Sensitive.” + + Request Details: + PII Identification: Only consider a column as PII if it directly matches an item from the PII list provided below. Do not infer or project beyond this list. If a column name or its associated comment closely resembles an item from the list, classify it as PII. + No Overgeneralization: Avoid overgeneralization or inference beyond what is explicitly stated in the list. Focus strictly on the provided PII list. + + Personal Identifiable Information (PII) List: + + Consider any of the following types of information as PII and categorize the corresponding column as “Descriptor - Sensitive”: + + - Person’s Name: PII (Includes first name, last name, or both). + - Social Security Number (SSN): PII + - Driver’s License Number: PII + - Passport Number: PII + - Email Address: PII + - Physical Street Address: PII (Includes street address, but excludes City, State, or standard 5-digit Zip code). + - Extended Zip Code: PII (Any Zip code with more than 5 digits). + - Telephone Number: PII (Includes both landline and mobile numbers). + - Date of Birth: PII + - Place of Birth: PII + - Biometric Data: PII (Includes fingerprints, facial recognition data, iris scans). 
+ - Medical Information: PII (Includes health records, prescriptions). + - Financial Information: PII (Includes bank account numbers, credit card numbers, debit card numbers). + - Employment Information: PII (Includes employment records, salary information). + - Insurance Information: PII (Includes policy numbers, claim information). + - Education Records: PII (Includes student records, transcripts). + - Online Identifiers: PII (Includes usernames, IP addresses, cookies, MAC addresses). + - Photographs or Videos: PII (Any media that can identify an individual). + - National Identification Numbers: PII (Includes identifiers outside of SSN, such as National Insurance Numbers in the UK). + - Geolocation Data: PII (Includes GPS coordinates, location history). + - Vehicle Registration Numbers: PII + + Not PII: + + Some data may seem personally identifiable; however, it is not specific enough to identify an individual. + + - Standard 5-Digit Zip Code: Not PII + - City: Not PII + - State: Not PII + - Country: Not PII + - Age (in years): Not PII (Unless combined with other identifiers like date of birth). + - Date or Timestamp (Example: created_date, created_timestamp, update_Date, update_timestamp): Not PII (Unless combined with other identifiers like date of birth) + - Gender: Not PII + - Ethnicity/Race: Not PII (General categories, e.g., “Caucasian,” “Asian,” without additional identifiers). + - Publicly Available Information: Not PII (Any information that is lawfully made available from federal, state, or local government records). + - Generic Job Titles: Not PII (Titles like “Manager,” “Engineer,” without additional identifying details). + - Company/Organization Name: Not PII (Names of companies or organizations without personal identifiers). + + Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your “Is PII” determination of true or false. 
A value of 0.80 or higher is considered reasonably confident in your true or false answer. + + + Reason: Indicate why you made the decision you did. + + Output: Please ensure that your output is JSON and matches the structure of the output examples provided. + + Example Input 1) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Column No: 4 + + Example Output 1) + { + "Descriptor - Sensitive": { + "Is PII": true, + "Confidence Value": 0.85, + "Reason": "The 'zip' column is identified as PII because its data type, character varying(10), allows for the possibility of storing extended zip codes, which matches an item on the provided PII list." + } + } + + Example Input 2) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Column No: 2 + + Example Output 2) + { + "Descriptor - Sensitive": { + "Is PII": false, + "Confidence Value": 0.90, + "Reason": "The 'city' column is not considered PII because city names do not match any item on the provided PII list." + } + } + + Example Input 3) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "employee", + "Column Details": [ + "Column No: 1 Named: employee_id of type: uuid And is a primary key. 
Column Comments: NA", + "Column No: 2 Named: full_name of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: email of type: character varying(255) Column Comments: NA", + "Column No: 4 Named: salary of type: numeric Column Comments: NA" + ] + } + + Column No: 2 + + Example Output 3) + { + "Descriptor - Sensitive": { + "Is PII": true, + "Confidence Value": 0.95, + "Reason": "The 'full_name' column is identified as PII because it matches the 'Person's Name' item from the provided PII list." + } + } + + Example Input 4) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "order", + "Column Details": [ + "Column No: 1 Named: order_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: order_date of type: date Column Comments: NA", + "Column No: 3 Named: customer_email of type: character varying(255) Column Comments: 'Email address of the customer who placed the order'", + "Column No: 4 Named: total_amount of type: numeric Column Comments: NA" + ] + } + + Column No: 3 + + Example Output 4) + { + "Descriptor - Sensitive": { + "Is PII": true, + "Confidence Value": 0.98, + "Reason": "The 'customer_email' column is identified as PII because it matches the 'Email Address' item from the provided PII list." + } + } + + Now, based on the instructions and examples above, please generate the appropriate JSON output only for the following JSON Source Table Object and Column No inputs. 
{hints} + + JSON Source Table Object: {new_json} + + Column No: {column_no} + + "#, + } + } +} \ No newline at end of file diff --git a/extension/src/utility/ollama_client.rs b/extension/src/utility/ollama_client.rs index e64d99a..e5694a4 100644 --- a/extension/src/utility/ollama_client.rs +++ b/extension/src/utility/ollama_client.rs @@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize}; use std::time::Duration; use crate::utility::guc; +use crate::model::prompt_template::PromptTemplate; use pgrx::prelude::*; @@ -75,320 +76,3 @@ pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u Ok(response_json) } - -#[derive(Debug)] -pub enum PromptTemplate { - BKIdentification, - BKName, - DescriptorSensitive, -} - -impl PromptTemplate { - fn template(&self) -> &str { - match self { - PromptTemplate::BKIdentification => r#" - Task Title: Business Key Identification in JSON Source Table Object - - You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. - - Requested Task: Identify the column number most likely to serve as the business key. Return only one column in JSON format as specified below. - - - Request Details: - If the column is a primary key, assume it is the business key. If not, choose the column most likely to uniquely identify the table’s entity. Additionally, provide a confidence value for your selection. - - Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in the selected column. A value of 0.80 or higher is considered reasonably confident. - - - Reason: Indicate why you made the decision you did. - - Output: Ensure the output conforms to the format shown in the examples below. 
- - Example Input 1) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Example Output 1) - { - "Identified Business Key": { - "Column No": 1, - "Confidence Value": 0.95, - "Reason": "The 'customer_id' column is designated as the primary key, which is typically the best candidate for a business key." - } - } - - Example Input 2) - JSON Source Table Object: - { - "Schema Name": "sales", - "Table Name": "order_details", - "Column Details": [ - "Column No: 1 Named: order_id of type: integer Column Comments: NA", - "Column No: 2 Named: product_id of type: integer Column Comments: NA", - "Column No: 3 Named: quantity of type: integer Column Comments: NA", - "Column No: 4 Named: order_date of type: date Column Comments: NA" - ] - } - - Example Output 2) - { - "Identified Business Key": { - "Column No": 1, - "Confidence Value": 0.75, - "Reason": "Although 'order_id' is not explicitly marked as a primary key, it is likely to uniquely identify each order, making it a strong candidate for the business key." - } - } - - Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} - - JSON Source Table Object: {new_json} - "#, - PromptTemplate::BKName => r#" - Task Title: Business Key Naming in JSON Source Table Object with specified Column - - You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. - - Requested Task: Identify the business key name. 
The business key part column has already been identified, and its associated column number, “column no”, will be provided along with the JSON Source Table Object. Return a name that best represents the business key from a data vault perspective. - - Request Details: - - The Business Key Name should be crafted based on the attribute linked to the business key, as identified by the provided column number. Prioritize the attribute name over the table name if the attribute name is descriptive enough. It should clearly represent the core business entity, avoiding generic terms like “ID,” “number,” or “Entity.” The name should focus solely on the business aspect, using terms like “customer,” “employee,” or “seller” that directly reflect the entity’s purpose, without unnecessary suffixes or identifiers. If the attribute associated with the business key or its column comments are not descriptive enough, the table name or schema name can be used to help formulate the Business Key Name. - - Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your chosen Business Key Name. A value of 0.80 or higher is considered reasonably confident. - - - Reason: Indicate why you made the decision you did. - - Output: Ensure the output conforms to the format shown in the examples below. - - Example Input 1) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. 
Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Column No: 1 - - Example Output 1) - { - "Business Key Name": { - "Name": "Customer", - "Confidence Value": 0.9, - "Reason": "The column 'customer_id' is a primary key and represents the unique identifier for customers in the 'customer' table. Given that the table name 'customer' directly reflects the business entity, 'Customer' is chosen as the Business Key Name. The confidence value is high because the identifier is straightforward and strongly aligned with the core business entity." - } - } - - Example Input 2) - JSON Source Table Object: - { - "Schema Name": "sales", - "Table Name": "order_details", - "Column Details": [ - "Column No: 1 Named: id of type: integer Column Comments: NA", - "Column No: 2 Named: product_id of type: integer Column Comments: NA", - "Column No: 3 Named: quantity of type: integer Column Comments: NA", - "Column No: 4 Named: order_date of type: date Column Comments: NA" - ] - } - - Column No: 1 - - Example Output 2) - { - "Business Key Name": { - "Name": "Order", - "Confidence Value": 0.85, - "Reason": "The column 'id' is a primary key and serves as the unique identifier for records in the 'order_details' table. Although the column name 'id' is generic, the table name 'order_details' indicates that the records pertain to individual orders. Therefore, 'Order' is chosen as the Business Key Name to best represent the core business entity. The confidence value is slightly lower due to the generic nature of the column name, but it is still reasonably confident given the context provided by the table name." - } - } - - Now, based on the instructions and examples above, please generate the JSON output for the following input. 
{hints} - - JSON Source Table Object: {new_json} - - Column No: {column_no} - "#, - PromptTemplate::DescriptorSensitive => r#" - Task Title: Identification of PII in JSON Source Table Object - - You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to assist in the creation of downstream data vault tables by performing the requested tasks based on this information. - - Requested Task: Identify if the descriptor is a descriptor sensitive PII subtype. A descriptor column, along with its associated column number (“column no”), will be provided in the JSON Source Table Object. If you determine that the column contains Personally Identifiable Information (PII), categorize it as “Descriptor - Sensitive.” - - Request Details: - PII Identification: Only consider a column as PII if it directly matches an item from the PII list provided below. Do not infer or project beyond this list. If a column name or its associated comment closely resembles an item from the list, classify it as PII. - No Overgeneralization: Avoid overgeneralization or inference beyond what is explicitly stated in the list. Focus strictly on the provided PII list. - - Personal Identifiable Information (PII) List: - - Consider any of the following types of information as PII and categorize the corresponding column as “Descriptor - Sensitive”: - - - Person’s Name: PII (Includes first name, last name, or both). - - Social Security Number (SSN): PII - - Driver’s License Number: PII - - Passport Number: PII - - Email Address: PII - - Physical Street Address: PII (Includes street address, but excludes City, State, or standard 5-digit Zip code). - - Extended Zip Code: PII (Any Zip code with more than 5 digits). - - Telephone Number: PII (Includes both landline and mobile numbers). - - Date of Birth: PII - - Place of Birth: PII - - Biometric Data: PII (Includes fingerprints, facial recognition data, iris scans). 
- - Medical Information: PII (Includes health records, prescriptions). - - Financial Information: PII (Includes bank account numbers, credit card numbers, debit card numbers). - - Employment Information: PII (Includes employment records, salary information). - - Insurance Information: PII (Includes policy numbers, claim information). - - Education Records: PII (Includes student records, transcripts). - - Online Identifiers: PII (Includes usernames, IP addresses, cookies, MAC addresses). - - Photographs or Videos: PII (Any media that can identify an individual). - - National Identification Numbers: PII (Includes identifiers outside of SSN, such as National Insurance Numbers in the UK). - - Geolocation Data: PII (Includes GPS coordinates, location history). - - Vehicle Registration Numbers: PII - - Not PII: - - Some data may seem personally identifiable; however, it is not specific enough to identify an individual. - - - Standard 5-Digit Zip Code: Not PII - - City: Not PII - - State: Not PII - - Country: Not PII - - Age (in years): Not PII (Unless combined with other identifiers like date of birth). - - Date or Timestamp (Example: created_date, created_timestamp, update_Date, update_timestamp): Not PII (Unless combined with other identiviers like date of birth) - - Gender: Not PII - - Ethnicity/Race: Not PII (General categories, e.g., “Caucasian,” “Asian,” without additional identifiers). - - Publicly Available Information: Not PII (Any information that is lawfully made available from federal, state, or local government records). - - Generic Job Titles: Not PII (Titles like “Manager,” “Engineer,” without additional identifying details). - - Company/Organization Name: Not PII (Names of companies or organizations without personal identifiers). - - Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your “Is PII” determination of true or false. 
A value of 0.80 or higher is considered reasonably confident in your true or false answer. - - - Reason: Indicate why you made the decision you did. - - Output: Please ensure that your output is JSON and matches the structure of the output examples provided. - - Example Input 1) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Column No: 4 - - Example Output 1) - { - "Descriptor - Sensitive": { - "Is PII": true, - "Confidence Value": 0.85, - "Reason": "The 'zip' column is identified as PII because its data type, character varying(10), allows for the possibility of storing extended zip codes, which matches an item on the provided PII list." - } - } - - Example Input 2) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Column No: 2 - - Example Output 2) - { - "Descriptor - Sensitive": { - "Is PII": false, - "Confidence Value": 0.90, - "Reason": "The 'city' column is not considered PII because city names do not match any item on the provided PII list." - } - } - - Example Input 3) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "employee", - "Column Details": [ - "Column No: 1 Named: employee_id of type: uuid And is a primary key. 
Column Comments: NA", - "Column No: 2 Named: full_name of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: email of type: character varying(255) Column Comments: NA", - "Column No: 4 Named: salary of type: numeric Column Comments: NA" - ] - } - - Column No: 2 - - Example Output 3) - { - "Descriptor - Sensitive": { - "Is PII": true, - "Confidence Value": 0.95, - "Reason": "The 'full_name' column is identified as PII because it matches the 'Person's Name' item from the provided PII list." - } - } - - Example Input 4) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "order", - "Column Details": [ - "Column No: 1 Named: order_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: order_date of type: date Column Comments: NA", - "Column No: 3 Named: customer_email of type: character varying(255) Column Comments: 'Email address of the customer who placed the order'", - "Column No: 4 Named: total_amount of type: numeric Column Comments: NA" - ] - } - - Column No: 3 - - Example Output 4) - { - "Descriptor - Sensitive": { - "Is PII": true, - "Confidence Value": 0.98, - "Reason": "The 'customer_email' column is identified as PII because it matches the 'Email Address' item from the provided PII list." - } - } - - Now, based on the instructions and examples above, please generate the appropriate JSON output only for the following JSON Source Table Object and Column No inputs. 
{hints} - - JSON Source Table Object: {new_json} - - Column No: {column_no} - - "#, - } - } -} - - diff --git a/extension/src/utility/openai_client.rs b/extension/src/utility/openai_client.rs index 1bf9608..09f9083 100644 --- a/extension/src/utility/openai_client.rs +++ b/extension/src/utility/openai_client.rs @@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize}; use std::time::Duration; use crate::utility::guc; +use crate::model::prompt_template::PromptTemplate; use pgrx::prelude::*; #[derive(Serialize, Debug)] @@ -126,321 +127,6 @@ pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u Ok(content_json) } -#[derive(Debug)] -pub enum PromptTemplate { - BKIdentification, - BKName, - DescriptorSensitive, - Test, -} - -impl PromptTemplate { - fn template(&self) -> &str { - match self { - PromptTemplate::BKIdentification => r#" - Task Title: Business Key Identification in JSON Source Table Object - - You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. - - Requested Task: Identify the column number most likely to serve as the business key. Return only one column in JSON format as specified below. - - - Request Details: - If the column is a primary key, assume it is the business key. If not, choose the column most likely to uniquely identify the table’s entity. Additionally, provide a confidence value for your selection. - - Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in the selected column. A value of 0.80 or higher is considered reasonably confident. - - - Reason: Indicate why you made the decision you did. - - Output: Ensure the output conforms to the format shown in the examples below. 
- - Example Input 1) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Example Output 1) - { - "Identified Business Key": { - "Column No": 1, - "Confidence Value": 0.95, - "Reason": "The 'customer_id' column is designated as the primary key, which is typically the best candidate for a business key." - } - } - - Example Input 2) - JSON Source Table Object: - { - "Schema Name": "sales", - "Table Name": "order_details", - "Column Details": [ - "Column No: 1 Named: order_id of type: integer Column Comments: NA", - "Column No: 2 Named: product_id of type: integer Column Comments: NA", - "Column No: 3 Named: quantity of type: integer Column Comments: NA", - "Column No: 4 Named: order_date of type: date Column Comments: NA" - ] - } - - Example Output 2) - { - "Identified Business Key": { - "Column No": 1, - "Confidence Value": 0.75, - "Reason": "Although 'order_id' is not explicitly marked as a primary key, it is likely to uniquely identify each order, making it a strong candidate for the business key." - } - } - - Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} - - JSON Source Table Object: {new_json} - "#, - PromptTemplate::BKName => r#" - Task Title: Business Key Naming in JSON Source Table Object with specified Column - - You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. - - Requested Task: Identify the business key name. 
The business key part column has already been identified, and its associated column number, “column no”, will be provided along with the JSON Source Table Object. Return a name that best represents the business key from a data vault perspective. - - Request Details: - - The Business Key Name should be crafted based on the attribute linked to the business key, as identified by the provided column number. Prioritize the attribute name over the table name if the attribute name is descriptive enough. It should clearly represent the core business entity, avoiding generic terms like “ID,” “number,” or “Entity.” The name should focus solely on the business aspect, using terms like “customer,” “employee,” or “seller” that directly reflect the entity’s purpose, without unnecessary suffixes or identifiers. If the attribute associated with the business key or its column comments are not descriptive enough, the table name or schema name can be used to help formulate the Business Key Name. - - Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your chosen Business Key Name. A value of 0.80 or higher is considered reasonably confident. - - - Reason: Indicate why you made the decision you did. - - Output: Ensure the output conforms to the format shown in the examples below. - - Example Input 1) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. 
Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Column No: 1 - - Example Output 1) - { - "Business Key Name": { - "Name": "Customer", - "Confidence Value": 0.9, - "Reason": "The column 'customer_id' is a primary key and represents the unique identifier for customers in the 'customer' table. Given that the table name 'customer' directly reflects the business entity, 'Customer' is chosen as the Business Key Name. The confidence value is high because the identifier is straightforward and strongly aligned with the core business entity." - } - } - - Example Input 2) - JSON Source Table Object: - { - "Schema Name": "sales", - "Table Name": "order_details", - "Column Details": [ - "Column No: 1 Named: id of type: integer Column Comments: NA", - "Column No: 2 Named: product_id of type: integer Column Comments: NA", - "Column No: 3 Named: quantity of type: integer Column Comments: NA", - "Column No: 4 Named: order_date of type: date Column Comments: NA" - ] - } - - Column No: 1 - - Example Output 2) - { - "Business Key Name": { - "Name": "Order", - "Confidence Value": 0.85, - "Reason": "The column 'id' is a primary key and serves as the unique identifier for records in the 'order_details' table. Although the column name 'id' is generic, the table name 'order_details' indicates that the records pertain to individual orders. Therefore, 'Order' is chosen as the Business Key Name to best represent the core business entity. The confidence value is slightly lower due to the generic nature of the column name, but it is still reasonably confident given the context provided by the table name." - } - } - - Now, based on the instructions and examples above, please generate the JSON output for the following input. 
{hints} - JSON Source Table Object: {new_json} - - Column No: {column_no} - "#, - PromptTemplate::DescriptorSensitive => r#" - Task Title: Identification of PII in JSON Source Table Object - - You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to assist in the creation of downstream data vault tables by performing the requested tasks based on this information. - - Requested Task: Identify if the descriptor is a descriptor sensitive PII subtype. A descriptor column, along with its associated column number (“column no”), will be provided in the JSON Source Table Object. If you determine that the column contains Personally Identifiable Information (PII), categorize it as “Descriptor - Sensitive.” - - Request Details: - PII Identification: Only consider a column as PII if it directly matches an item from the PII list provided below. Do not infer or project beyond this list. If a column name or its associated comment closely resembles an item from the list, classify it as PII. - No Overgeneralization: Avoid overgeneralization or inference beyond what is explicitly stated in the list. Focus strictly on the provided PII list. - - Personal Identifiable Information (PII) List: - - Consider any of the following types of information as PII and categorize the corresponding column as “Descriptor - Sensitive”: - - - Person’s Name: PII (Includes first name, last name, or both). - - Social Security Number (SSN): PII - - Driver’s License Number: PII - - Passport Number: PII - - Email Address: PII - - Physical Street Address: PII (Includes street address, but excludes City, State, or standard 5-digit Zip code). - - Extended Zip Code: PII (Any Zip code with more than 5 digits). - - Telephone Number: PII (Includes both landline and mobile numbers). - - Date of Birth: PII - - Place of Birth: PII - - Biometric Data: PII (Includes fingerprints, facial recognition data, iris scans). 
- - Medical Information: PII (Includes health records, prescriptions). - - Financial Information: PII (Includes bank account numbers, credit card numbers, debit card numbers). - - Employment Information: PII (Includes employment records, salary information). - - Insurance Information: PII (Includes policy numbers, claim information). - - Education Records: PII (Includes student records, transcripts). - - Online Identifiers: PII (Includes usernames, IP addresses, cookies, MAC addresses). - - Photographs or Videos: PII (Any media that can identify an individual). - - National Identification Numbers: PII (Includes identifiers outside of SSN, such as National Insurance Numbers in the UK). - - Geolocation Data: PII (Includes GPS coordinates, location history). - - Vehicle Registration Numbers: PII - - Not PII: - - Some data may seem personally identifiable; however, it is not specific enough to identify an individual. - - - Standard 5-Digit Zip Code: Not PII - - City: Not PII - - State: Not PII - - Country: Not PII - - Age (in years): Not PII (Unless combined with other identifiers like date of birth). - - Date or Timestamp (Example: created_date, created_timestamp, update_Date, update_timestamp): Not PII (Unless combined with other identiviers like date of birth) - - Gender: Not PII - - Ethnicity/Race: Not PII (General categories, e.g., “Caucasian,” “Asian,” without additional identifiers). - - Publicly Available Information: Not PII (Any information that is lawfully made available from federal, state, or local government records). - - Generic Job Titles: Not PII (Titles like “Manager,” “Engineer,” without additional identifying details). - - Company/Organization Name: Not PII (Names of companies or organizations without personal identifiers). - - Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your “Is PII” determination of true or false. 
A value of 0.80 or higher is considered reasonably confident in your true or false answer. - - - Reason: Indicate why you made the decision you did. - - Output: Please ensure that your output is JSON and matches the structure of the output examples provided. - - Example Input 1) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Column No: 4 - - Example Output 1) - { - "Descriptor - Sensitive": { - "Is PII": true, - "Confidence Value": 0.85, - "Reason": "The 'zip' column is identified as PII because its data type, character varying(10), allows for the possibility of storing extended zip codes, which matches an item on the provided PII list." - } - } - - Example Input 2) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Column No: 2 - - Example Output 2) - { - "Descriptor - Sensitive": { - "Is PII": false, - "Confidence Value": 0.90, - "Reason": "The 'city' column is not considered PII because city names do not match any item on the provided PII list." - } - } - - Example Input 3) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "employee", - "Column Details": [ - "Column No: 1 Named: employee_id of type: uuid And is a primary key. 
Column Comments: NA", - "Column No: 2 Named: full_name of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: email of type: character varying(255) Column Comments: NA", - "Column No: 4 Named: salary of type: numeric Column Comments: NA" - ] - } - - Column No: 2 - - Example Output 3) - { - "Descriptor - Sensitive": { - "Is PII": true, - "Confidence Value": 0.95, - "Reason": "The 'full_name' column is identified as PII because it matches the 'Person's Name' item from the provided PII list." - } - } - - Example Input 4) - JSON Source Table Object: - { - "Schema Name": "public", - "Table Name": "order", - "Column Details": [ - "Column No: 1 Named: order_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: order_date of type: date Column Comments: NA", - "Column No: 3 Named: customer_email of type: character varying(255) Column Comments: 'Email address of the customer who placed the order'", - "Column No: 4 Named: total_amount of type: numeric Column Comments: NA" - ] - } - - Column No: 3 - - Example Output 4) - { - "Descriptor - Sensitive": { - "Is PII": true, - "Confidence Value": 0.98, - "Reason": "The 'customer_email' column is identified as PII because it matches the 'Email Address' item from the provided PII list." - } - } - - Now, based on the instructions and examples above, please generate the appropriate JSON output only for the following JSON Source Table Object and Column No inputs. {hints} - - JSON Source Table Object: {new_json} - - Column No: {column_no} - - "#, - PromptTemplate::Test => r#"Why is the sky blue? Only respond in PROPER JSON FORMAT."#, - } - } -}