From 0907b7c5bf5256ced5eafe6484210576353fa97e Mon Sep 17 00:00:00 2001 From: analyzer1 Date: Tue, 10 Sep 2024 09:39:47 -0400 Subject: [PATCH] Feature: PADW-53 Accept New Tables (#6) * Accepting new tables: Incremental LLM requests focusing independently on business key attributes, naming, descriptor categorization (e.g., PII), and providing reasons with confidence scores to mitigate hallucinations and improve control. * Remove excessive logging * Enable transformer retries with hints if the appropriate JSON structure is not returned. --- extension/Cargo.toml | 1 + .../src/controller/bgw_transformer_client.rs | 311 ++++++++++--- extension/src/lib.rs | 2 +- extension/src/model/source_objects.rs | 40 +- extension/src/utility/ollama_client.rs | 418 ++++++++++++++---- 5 files changed, 635 insertions(+), 137 deletions(-) diff --git a/extension/Cargo.toml b/extension/Cargo.toml index 8eb9088..7c9f990 100644 --- a/extension/Cargo.toml +++ b/extension/Cargo.toml @@ -25,6 +25,7 @@ tokio = { version = "1", features = ["full"] } uuid = { version = "1.1", features = ["v4", "v5", "serde"] } chrono = { version = "0.4", features = ["serde"] } anyhow = "1.0" +regex = "1.7" [dev-dependencies] pgrx-tests = "=0.11.4" diff --git a/extension/src/controller/bgw_transformer_client.rs b/extension/src/controller/bgw_transformer_client.rs index 5069845..a9ccc03 100644 --- a/extension/src/controller/bgw_transformer_client.rs +++ b/extension/src/controller/bgw_transformer_client.rs @@ -2,13 +2,17 @@ use pgrx::bgworkers::*; use pgrx::prelude::*; use std::time::Duration; +use std::collections::HashMap; use tokio::runtime::Runtime; -use serde::de::DeserializeOwned; -use serde_json::from_value; +use serde::Deserialize; use crate::queries; use crate::model::source_objects; use crate::utility::ollama_client; +use crate::utility::guc; +use regex::Regex; + +const MAX_TRANSFORMER_RETRIES: u8 = 3; // TODO: Set in GUC #[pg_guard] #[no_mangle] @@ -20,10 +24,8 @@ pub extern "C" fn 
background_worker_transformer_client(_arg: pg_sys::Datum) { // Initialize Tokio runtime let runtime = Runtime::new().expect("Failed to create Tokio runtime"); - while BackgroundWorker::wait_latch(Some(Duration::from_secs(10))) { - // Load Prompts into Results let result: Result, pgrx::spi::Error> = BackgroundWorker::transaction(|| { Spi::connect(|client| { @@ -40,7 +42,6 @@ pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { table_column_links: table_column_links, table_details: table_details }; - v_source_table_prompts.push(source_table_prompt) } Ok(v_source_table_prompts) @@ -57,57 +58,209 @@ pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { let table_column_link_json_str = serde_json::to_string_pretty(&source_table_prompt.table_column_links).expect("Failed to convert JSON Column Links to pretty string"); let table_column_links_o: Option = serde_json::from_str(&table_column_link_json_str).ok(); + + let columns = extract_column_numbers(&table_details_json_str); + - // Define generation_json_o outside the runtime.block_on block - let mut generation_json_o: Option = None; - - // Run the async block - runtime.block_on(async { - // Get Generation - generation_json_o = match ollama_client::send_request(table_details_json_str.as_str()).await { - Ok(response_json) => { - // log!("Transformer client request successful. {:?}", response_json); - Some(response_json) - }, + + // Identity BK Ordinal Location + let mut generation_json_bk_identification: Option = None; + let mut identified_business_key_opt: Option = None; + let mut retries = 0; + let mut hints = String::new(); + while retries < MAX_TRANSFORMER_RETRIES { + runtime.block_on(async { + // Get Generation + generation_json_bk_identification = match ollama_client::send_request(table_details_json_str.as_str(), ollama_client::PromptTemplate::BKIdentification, &0, &hints).await { + Ok(mut response_json) => { + + // TODO: Add a function to enable logging. 
+ // let response_json_pretty = serde_json::to_string_pretty(&response_json) + // .expect("Failed to convert Response JSON to Pretty String."); + Some(response_json) + }, + Err(e) => { + log!("Error in Ollama client request: {}", e); + hints = format!("Hint: Please ensure you provide a JSON response only. This is your {} attempt.", retries + 1); + None + } + }; + }); + // let identified_business_key: IdentifiedBusinessKey = serde_json::from_value(generation_json_bk_identification.unwrap()).expect("Not valid JSON"); + + match serde_json::from_value::(generation_json_bk_identification.clone().unwrap()) { + Ok(bk) => { + identified_business_key_opt = Some(bk); + break; // Successfully Decoded + } Err(e) => { - log!("Error in Ollama client request: {}", e); - None + log!("Error JSON JSON Structure not of type IdentifiedBusinessKey: {}", e); + hints = format!("Hint: Please ensure the correct JSON key pair structure is given. Previously you gave a response but it errored. Error: {e}. Please try again."); } - }; - }); + } + retries += 1; + } + + let identified_business_key = match identified_business_key_opt { + Some(bk) => bk, + None => panic!("Failed to identify business key after {} retries", retries), + }; + + // Identity BK Name + let mut generation_json_bk_name: Option = None; + let mut business_key_name_opt: Option = None; + let mut retries = 0; + let mut hints = String::new(); + while retries < MAX_TRANSFORMER_RETRIES { + runtime.block_on(async { + // Get Generation + generation_json_bk_name = match ollama_client::send_request(table_details_json_str.as_str(), ollama_client::PromptTemplate::BKName, &0, &hints).await { + Ok(mut response_json) => { + + // let response_json_pretty = serde_json::to_string_pretty(&response_json) + // .expect("Failed to convert Response JSON to Pretty String."); + Some(response_json) + }, + Err(e) => { + log!("Error in Ollama client request: {}", e); + hints = format!("Hint: Please ensure you provide a JSON response only. 
This is your {} attempt.", retries + 1); + None + } + }; + }); + + match serde_json::from_value::(generation_json_bk_name.clone().unwrap()) { + Ok(bk) => { + business_key_name_opt = Some(bk); + break; // Successfully Decoded + } + Err(e) => { + log!("Error JSON JSON Structure not of type BusinessKeyName: {}", e); + } + } + retries += 1; + } + + let business_key_name = match business_key_name_opt { + Some(bk) => bk, + None => panic!("Failed to identify business key name after {} retries", retries), + }; - let generation_table_detail_o: Option = deserialize_option(generation_json_o); + // Identity Descriptor - Sensitive + // let mut generation_json_descriptors_sensitive: HashMap<&u32, Option> = HashMap::new(); + let mut descriptors_sensitive: HashMap<&u32, DescriptorSensitive> = HashMap::new(); + let mut generation_json_descriptor_sensitive: Option = None; + for column in &columns { + let mut retries = 0; + let mut hints = String::new(); + while retries < MAX_TRANSFORMER_RETRIES { + // Run the async block + runtime.block_on(async { + // Get Generation + generation_json_descriptor_sensitive = + match ollama_client::send_request( + table_details_json_str.as_str(), + ollama_client::PromptTemplate::DescriptorSensitive, + column, + &hints).await { + Ok(mut response_json) => { + + // let response_json_pretty = serde_json::to_string_pretty(&response_json) + // .expect("Failed to convert Response JSON to Pretty String."); + + Some(response_json) + }, + Err(e) => { + log!("Error in Ollama client request: {}", e); + hints = format!("Hint: Please ensure you provide a JSON response only. 
This is your {} attempt.", retries + 1); + None + } + }; + // generation_json_descriptors_sensitive.insert(column, generation_json_descriptor_sensitive); + }); + + match serde_json::from_value::(generation_json_descriptor_sensitive.clone().unwrap()) { + Ok(des) => { + // business_key_name_opt = Some(des); + descriptors_sensitive.insert(column, des); + break; // Successfully Decoded + } + Err(e) => { + log!("Error JSON JSON Structure not of type DescriptorSensitive: {}", e); + } + } + + retries += 1; + } + } let table_column_links = table_column_links_o.unwrap(); - let generation_table_detail = generation_table_detail_o.unwrap(); - // Build the SQL INSERT statement + // Build the SQL INSERT statement let mut insert_sql = String::from("INSERT INTO auto_dw.transformer_responses (fk_source_objects, model_name, category, business_key_name, confidence_score, reason) VALUES "); - for (index, column_link) in table_column_links.column_links.iter().enumerate() { - - let not_last = index != table_column_links.column_links.len() - 1; - - let index_o = generation_table_detail.response_column_details.iter().position(|r| r.column_no == column_link.column_ordinal_position); - match index_o { - Some(index) => { - let column_detail = &generation_table_detail.response_column_details[index]; - - let category = &column_detail.category.replace("'", "''"); - let business_key_name = &column_detail.business_key_name.replace("'", "''"); - let confidence_score = &column_detail.confidence; - let reason = &column_detail.reason.replace("'", "''"); - let pk_source_objects = column_link.pk_source_objects; - - let model_name = "Mixtral"; - - if not_last { - insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, business_key_name, confidence_score, reason)); - } else { - insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, business_key_name, confidence_score, reason)); + for (index, column) in 
columns.iter().enumerate() { + + let last = {index == table_column_links.column_links.len() - 1}; + + if column == &identified_business_key.identified_business_key_values.column_no { + + let category = "Business Key Part"; + let confidence_score = identified_business_key.identified_business_key_values.confidence_value * business_key_name.business_key_name_values.confidence_value; + let bk_name = &business_key_name.business_key_name_values.name; + let bk_identified_reason = &identified_business_key.identified_business_key_values.reason; + let bk_name_reason = &business_key_name.business_key_name_values.reason; + let reason = format!("BK Identified Reason: {}, BK Naming Reason: {}", bk_identified_reason, bk_name_reason); + let model_name_owned = guc::get_guc(guc::PgAutoDWGuc::Model).expect("MODEL GUC is not set."); + let model_name = model_name_owned.as_str(); + + let pk_source_objects: i32; + if let Some(pk_source_objects_temp) = table_column_links.find_pk_source_objects(column.clone() as i32) { + pk_source_objects = pk_source_objects_temp; + } else { + println!("No match found for column_ordinal_position: {}", column); + panic!() + } + + if !last { + insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); + } else { + insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); + } + + } else { + + let pk_source_objects: i32; + let mut category = "Descriptor"; + let mut confidence_score: f64 = 1.0; + let bk_name = "NA"; + let mut reason = "Defaulted of category 'Descriptor' maintained.".to_string(); + let model_name_owned = guc::get_guc(guc::PgAutoDWGuc::Model).expect("MODEL GUC is not set."); + let model_name = model_name_owned.as_str(); + + + if let Some(pk_source_objects_temp) = 
table_column_links.find_pk_source_objects(column.clone() as i32) { + pk_source_objects = pk_source_objects_temp; + } else { + println!("No match found for column_ordinal_position: {}", column); + panic!() + } + + if let Some(descriptor_sensitive) = descriptors_sensitive.get(&column) { + if descriptor_sensitive.descriptor_sensitive_values.is_pii && (descriptor_sensitive.descriptor_sensitive_values.confidence_value > 0.5) { + category = "Descriptor - Sensitive"; + confidence_score = descriptor_sensitive.descriptor_sensitive_values.confidence_value; + reason = descriptor_sensitive.descriptor_sensitive_values.reason.clone(); } + } else { + log!("Teseting Can't find a response for {} in Descriptors Sensitive Hashmap.", column); + } + + if !last { + insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); + } else { + insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); } - None => {break;} } } @@ -117,16 +270,66 @@ pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { _ = client.update(insert_sql.as_str(), None, None); }) }); - } + } } } -fn deserialize_option(json_option: Option) -> Option -where - T: DeserializeOwned -{ - json_option.and_then(|json| { - from_value::(json).ok() - }) -} \ No newline at end of file +fn extract_column_numbers(json_str: &str) -> Vec { + // Define a regex to capture the column numbers + let re = Regex::new(r"Column No: (\d+)").expect("Invalid regex"); + + // Find all matches and collect the column numbers + re.captures_iter(json_str) + .filter_map(|caps| caps.get(1).map(|m| m.as_str().parse::().unwrap())) + .collect() +} + +#[derive(Deserialize, Debug)] +struct IdentifiedBusinessKey { + #[serde(rename = "Identified Business Key")] + identified_business_key_values: 
IdentifiedBusinessKeyValues, +} + +#[derive(Deserialize, Debug)] +struct IdentifiedBusinessKeyValues { + #[serde(rename = "Column No")] + column_no: u32, + #[serde(rename = "Confidence Value")] + confidence_value: f64, + #[serde(rename = "Reason")] + reason: String, +} + +#[derive(Deserialize, Debug)] +struct BusinessKeyName { + #[serde(rename = "Business Key Name")] + business_key_name_values: BusinessKeyNameValues, +} + +#[derive(Deserialize, Debug)] +struct BusinessKeyNameValues { + #[serde(rename = "Name")] + name: String, + #[serde(rename = "Confidence Value")] + confidence_value: f64, + #[serde(rename = "Reason")] + reason: String, +} + +#[derive(Deserialize, Debug)] +struct DescriptorSensitive { + #[serde(rename = "Descriptor - Sensitive")] + descriptor_sensitive_values: DescriptorSensitiveValues, +} + +#[derive(Deserialize, Debug)] +struct DescriptorSensitiveValues { + #[serde(rename = "Is PII")] + is_pii: bool, + #[serde(rename = "Confidence Value")] + confidence_value: f64, + #[serde(rename = "Reason")] + reason: String, +} + diff --git a/extension/src/lib.rs b/extension/src/lib.rs index 08653b2..7755d79 100644 --- a/extension/src/lib.rs +++ b/extension/src/lib.rs @@ -49,7 +49,7 @@ fn source_include( schema_pattern_include: &str, } #[pg_extern] -fn source_exlude( schema_pattern_exclude: &str, +fn source_exclude( schema_pattern_exclude: &str, table_pattern_exclude: default!(Option<&str>, "NULL"), column_pattern_exclude: default!(Option<&str>, "NULL")) -> &'static str { let schema_pattern_include: &str = "a^"; diff --git a/extension/src/model/source_objects.rs b/extension/src/model/source_objects.rs index 5a7e9bb..e9334d2 100644 --- a/extension/src/model/source_objects.rs +++ b/extension/src/model/source_objects.rs @@ -1,5 +1,5 @@ use pgrx::Json as JsonValue; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize}; #[derive(Debug)] pub struct SourceTablePrompt { @@ -9,6 +9,18 @@ pub struct SourceTablePrompt { pub 
table_details: JsonValue, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct SourceTableDetail { + #[serde(rename = "Schema Name")] + pub schema_name: String, + + #[serde(rename = "Table Name")] + pub table_name: String, + + #[serde(rename = "Column Details")] + pub column_details: Vec, +} + #[derive(Debug, Serialize, Deserialize)] pub struct Response { #[serde(rename = "Table ID")] @@ -17,11 +29,11 @@ pub struct Response { pub generation: GenerationTableDetail, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] pub struct GenerationColumnDetail { #[serde(rename = "Category")] pub category: String, - #[serde(rename = "Business Key Name")] + #[serde(rename = "Business Key Name", deserialize_with = "replace_spaces_with_underscores")] pub business_key_name: String, #[serde(rename = "Column No")] pub column_no: i32, @@ -31,7 +43,7 @@ pub struct GenerationColumnDetail { pub reason: String, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] pub struct GenerationTableDetail { #[serde(rename = "Schema Name")] pub schema_name: String, @@ -53,4 +65,24 @@ pub struct ColumnLink { pub struct TableLinks { #[serde(rename = "Column Links")] pub column_links: Vec, +} + +impl TableLinks { + // Method to find the pk_source_objects based on column_ordinal_position + pub fn find_pk_source_objects(&self, search_position: i32) -> Option { + for link in &self.column_links { + if link.column_ordinal_position == search_position { + return Some(link.pk_source_objects); + } + } + None + } +} + +fn replace_spaces_with_underscores<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + Ok(s.replace(' ', "_")) } \ No newline at end of file diff --git a/extension/src/utility/ollama_client.rs b/extension/src/utility/ollama_client.rs index 62faa97..383a9be 100644 --- a/extension/src/utility/ollama_client.rs +++ 
b/extension/src/utility/ollama_client.rs @@ -10,6 +10,12 @@ pub struct GenerateRequest { pub prompt: String, pub format: String, pub stream: bool, + pub options: Options, +} + +#[derive(Serialize, Debug)] +pub struct Options { + pub temperature: f64, } #[derive(Deserialize, Debug)] @@ -21,96 +27,35 @@ pub struct GenerateResponse { pub done: bool, } -pub async fn send_request(new_json: &str) -> Result> { +pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str) -> Result> { - let client = ClientBuilder::new().timeout(Duration::from_secs(90)).build()?; // 30 sec Default to short for some LLMS. + let client = ClientBuilder::new().timeout(Duration::from_secs(180)).build()?; // 30 sec Default to short for some LLMS. - let prompt_template = r#" - You are given a JSON object containing the schema name, table name, and details about each column in a table. This is source table information and downstream we are creating data vault tables. In this case, we are focusing on categorization for building hubs and satellites. Your task is to categorize each column into one of three categories: "Business Key Part," "Descriptor," or "Descriptor - Sensitive." The response should include the column number, the category type, and a confidence score for each categorization. - - Additionally, you should also include the business key name assoicated with the attributes that are part of the busines key, i.e. category "Business Key Part." The Business Key Name should be derived from the table name and the attributes associated with the business key parts, and it should reflect the entity described by the table. The Business Key Name should not include terms like "ID," "number," or similar identifiers. It should express the business entity associated with the table, such as "customer," "employee," or "seller." The Business Key Name should also not include terms like "Entity" or similar suffixes, and should only reflect the core business entity name. 
- - Hard Rules: - - If the column is a primary key, set "Category" to "Business Key Part." - - Example Input: - - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", - "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", - "Column No: 3 Named: state of type: character(2) Column Comments: NA", - "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" - ] - } - - Expected Output: - - { - "Schema Name": "public", - "Table Name": "customer", - "Column Details": [ - { - "Column No": 1, - "Category": "Business Key Part", - "Business Key Name: "customer", - "Confidence": 0.99, - "Reason": "The column 'customer_id' is a primary key, which is a strong indicator of a Business Key." - }, - { - "Column No": 2, - "Category": "Descriptor", - "Business Key Name: "NA", - "Confidence": 0.85, - "Reason": "The column 'city' provides general descriptive information about the customer, which is typically a Descriptor." - }, - { - "Column No": 3, - "Category": "Descriptor", - "Business Key Name: "NA", - "Confidence": 0.80, - "Reason": "The column 'state' provides general descriptive information and is less likely to be sensitive, hence categorized as a Descriptor." - }, - { - "Column No": 4, - "Category": "Descriptor - Sensitive", - "Business Key Name: "NA", - "Confidence": 0.90, - "Reason": "The column 'zip' contains potentially sensitive information about the customer's location, which requires careful handling." - } - ] - } + let prompt_template = template_type.template(); - New JSON to Consider: - - {new_json} - - Categorize the columns in the new JSON according to the following categories only: - - "Business Key Part": Identifiers or primary keys. - - "Descriptor": General descriptive attributes. - - "Descriptor - Sensitive": Sensitive information that needs to be handled with care. 
- - If you have qualifiers, like it would be a "Descriptor - Sensitive" if this case is true. Lower your confidence score and include that in the reasons. - - Hard Rule: Only categories into the 3 categories listed above. - - Return the output JSON with the column number, the category type, a confidence score, and reason for each column. Plus, if the category is a business key part, provide a business key name at the attribute level. The business key name should be derived from the table name and the attributes associated with the business key parts. The name should exclude terms like "ID," "number," and "Entity," and reflecting only the core business entity name. If the category is not a business key part specify "Business Key Name: "NA" as the example above shows. - "#; - - // Inject new_json into the prompt_template - let prompt = prompt_template.replace("{new_json}", new_json); + // Inject new_json into the prompt_template' + let column_number = col.to_string(); + let prompt = prompt_template + .replace("{new_json}", new_json) + .replace("{column_no}", &column_number) + .replace("{hints}", &hints); // GUC Values for the transformer server let transformer_server_url = guc::get_guc(guc::PgAutoDWGuc::TransformerServerUrl).ok_or("GUC: Transformer Server URL is not set")?; let model = guc::get_guc(guc::PgAutoDWGuc::Model).ok_or("MODEL GUC is not set.")?; + let temperature: f64 = 0.75; + + let options: Options = Options{ + temperature, + }; + let request = GenerateRequest { model, prompt, format: "json".to_string(), stream: false, + options, }; let response = client @@ -125,4 +70,321 @@ pub async fn send_request(new_json: &str) -> Result &str { + match self { + PromptTemplate::BKIdentification => r#" + Task Title: Business Key Identification in JSON Source Table Object + + You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. 
Your responses to requested tasks will be used to help create downstream data vault tables. + + Requested Task: Identify the column number most likely to serve as the business key. Return only one column in JSON format as specified below. + + + Request Details: + If the column is a primary key, assume it is the business key. If not, choose the column most likely to uniquely identify the table’s entity. Additionally, provide a confidence value for your selection. + + Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in the selected column. A value of 0.80 or higher is considered reasonably confident. + + + Reason: Indicate why you made the decision you did. + + Output: Ensure the output conforms to the format shown in the examples below. + + Example Input 1) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Example Output 1) + { + "Identified Business Key": { + "Column No": 1, + "Confidence Value": 0.95, + "Reason": "The 'customer_id' column is designated as the primary key, which is typically the best candidate for a business key." 
+ } + } + + Example Input 2) + JSON Source Table Object: + { + "Schema Name": "sales", + "Table Name": "order_details", + "Column Details": [ + "Column No: 1 Named: order_id of type: integer Column Comments: NA", + "Column No: 2 Named: product_id of type: integer Column Comments: NA", + "Column No: 3 Named: quantity of type: integer Column Comments: NA", + "Column No: 4 Named: order_date of type: date Column Comments: NA" + ] + } + + Example Output 2) + { + "Identified Business Key": { + "Column No": 1, + "Confidence Value": 0.75, + "Reason": "Although 'order_id' is not explicitly marked as a primary key, it is likely to uniquely identify each order, making it a strong candidate for the business key." + } + } + + Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} + + JSON Source Table Object: {new_json} + "#, + PromptTemplate::BKName => r#" + Task Title: Business Key Naming in JSON Source Table Object with specified Column + + You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. + + Requested Task: Identify the business key name. The business key part column has already been identified, and its associated column number, “column no”, will be provided along with the JSON Source Table Object. Return a name that best represents the business key from a data vault perspective. + + Request Details: + + The Business Key Name should be crafted based on the attribute linked to the business key, as identified by the provided column number. Prioritize the attribute name over the table name if the attribute name is descriptive enough. 
It should clearly represent the core business entity, avoiding generic terms like “ID,” “number,” or “Entity.” The name should focus solely on the business aspect, using terms like “customer,” “employee,” or “seller” that directly reflect the entity’s purpose, without unnecessary suffixes or identifiers. If the attribute associated with the business key or its column comments are not descriptive enough, the table name or schema name can be used to help formulate the Business Key Name. + + Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your chosen Business Key Name. A value of 0.80 or higher is considered reasonably confident. + + + Reason: Indicate why you made the decision you did. + + Output: Ensure the output conforms to the format shown in the examples below. + + Example Input 1) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Column No: 1 + + Example Output 1) + { + "Business Key Name": { + "Name": "Customer", + "Confidence Value": 0.9, + "Reason": "The column 'customer_id' is a primary key and represents the unique identifier for customers in the 'customer' table. Given that the table name 'customer' directly reflects the business entity, 'Customer' is chosen as the Business Key Name. The confidence value is high because the identifier is straightforward and strongly aligned with the core business entity." 
+ } + } + + Example Input 2) + JSON Source Table Object: + { + "Schema Name": "sales", + "Table Name": "order_details", + "Column Details": [ + "Column No: 1 Named: id of type: integer Column Comments: NA", + "Column No: 2 Named: product_id of type: integer Column Comments: NA", + "Column No: 3 Named: quantity of type: integer Column Comments: NA", + "Column No: 4 Named: order_date of type: date Column Comments: NA" + ] + } + + Column No: 1 + + Example Output 2) + { + "Business Key Name": { + "Name": "Order", + "Confidence Value": 0.85, + "Reason": "The column 'id' is a primary key and serves as the unique identifier for records in the 'order_details' table. Although the column name 'id' is generic, the table name 'order_details' indicates that the records pertain to individual orders. Therefore, 'Order' is chosen as the Business Key Name to best represent the core business entity. The confidence value is slightly lower due to the generic nature of the column name, but it is still reasonably confident given the context provided by the table name." + } + } + + Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} + + JSON Source Table Object: {new_json} + + Column No: {column_no} + "#, + PromptTemplate::DescriptorSensitive => r#" + Task Title: Identification of PII in JSON Source Table Object + + You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to assist in the creation of downstream data vault tables by performing the requested tasks based on this information. + + Requested Task: Identify if the descriptor is a descriptor sensitive PII subtype. A descriptor column, along with its associated column number (“column no”), will be provided in the JSON Source Table Object. 
If you determine that the column contains Personally Identifiable Information (PII), categorize it as “Descriptor - Sensitive.” + + Request Details: + PII Identification: Only consider a column as PII if it directly matches an item from the PII list provided below. Do not infer or project beyond this list. If a column name or its associated comment closely resembles an item from the list, classify it as PII. + No Overgeneralization: Avoid overgeneralization or inference beyond what is explicitly stated in the list. Focus strictly on the provided PII list. + + Personal Identifiable Information (PII) List: + + Consider any of the following types of information as PII and categorize the corresponding column as “Descriptor - Sensitive”: + + - Person’s Name: PII (Includes first name, last name, or both). + - Social Security Number (SSN): PII + - Driver’s License Number: PII + - Passport Number: PII + - Email Address: PII + - Physical Street Address: PII (Includes street address, but excludes City, State, or standard 5-digit Zip code). + - Extended Zip Code: PII (Any Zip code with more than 5 digits). + - Telephone Number: PII (Includes both landline and mobile numbers). + - Date of Birth: PII + - Place of Birth: PII + - Biometric Data: PII (Includes fingerprints, facial recognition data, iris scans). + - Medical Information: PII (Includes health records, prescriptions). + - Financial Information: PII (Includes bank account numbers, credit card numbers, debit card numbers). + - Employment Information: PII (Includes employment records, salary information). + - Insurance Information: PII (Includes policy numbers, claim information). + - Education Records: PII (Includes student records, transcripts). + - Online Identifiers: PII (Includes usernames, IP addresses, cookies, MAC addresses). + - Photographs or Videos: PII (Any media that can identify an individual). 
+ - National Identification Numbers: PII (Includes identifiers outside of SSN, such as National Insurance Numbers in the UK). + - Geolocation Data: PII (Includes GPS coordinates, location history). + - Vehicle Registration Numbers: PII + + Not PII: + + Some data may seem personally identifiable; however, it is not specific enough to identify an individual. + + - Standard 5-Digit Zip Code: Not PII + - City: Not PII + - State: Not PII + - Country: Not PII + - Age (in years): Not PII (Unless combined with other identifiers like date of birth). + - Date or Timestamp (Example: created_date, created_timestamp, update_Date, update_timestamp): Not PII (Unless combined with other identifiers like date of birth). + - Gender: Not PII + - Ethnicity/Race: Not PII (General categories, e.g., “Caucasian,” “Asian,” without additional identifiers). + - Publicly Available Information: Not PII (Any information that is lawfully made available from federal, state, or local government records). + - Generic Job Titles: Not PII (Titles like “Manager,” “Engineer,” without additional identifying details). + - Company/Organization Name: Not PII (Names of companies or organizations without personal identifiers). + + Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your “Is PII” determination of true or false. A value of 0.80 or higher is considered reasonably confident in your true or false answer. + + + Reason: Indicate why you made the decision you did. + + Output: Please ensure that your output is JSON and matches the structure of the output examples provided. + + Example Input 1) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. 
Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Column No: 4 + + Example Output 1) + { + "Descriptor - Sensitive": { + "Is PII": true, + "Confidence Value": 0.85, + "Reason": "The 'zip' column is identified as PII because its data type, character varying(10), allows for the possibility of storing extended zip codes, which matches an item on the provided PII list." + } + } + + Example Input 2) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "customer", + "Column Details": [ + "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: state of type: character(2) Column Comments: NA", + "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" + ] + } + + Column No: 2 + + Example Output 2) + { + "Descriptor - Sensitive": { + "Is PII": false, + "Confidence Value": 0.90, + "Reason": "The 'city' column is not considered PII because city names do not match any item on the provided PII list." + } + } + + Example Input 3) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "employee", + "Column Details": [ + "Column No: 1 Named: employee_id of type: uuid And is a primary key. 
Column Comments: NA", + "Column No: 2 Named: full_name of type: character varying(255) Column Comments: NA", + "Column No: 3 Named: email of type: character varying(255) Column Comments: NA", + "Column No: 4 Named: salary of type: numeric Column Comments: NA" + ] + } + + Column No: 2 + + Example Output 3) + { + "Descriptor - Sensitive": { + "Is PII": true, + "Confidence Value": 0.95, + "Reason": "The 'full_name' column is identified as PII because it matches the 'Person's Name' item from the provided PII list." + } + } + + Example Input 4) + JSON Source Table Object: + { + "Schema Name": "public", + "Table Name": "order", + "Column Details": [ + "Column No: 1 Named: order_id of type: uuid And is a primary key. Column Comments: NA", + "Column No: 2 Named: order_date of type: date Column Comments: NA", + "Column No: 3 Named: customer_email of type: character varying(255) Column Comments: 'Email address of the customer who placed the order'", + "Column No: 4 Named: total_amount of type: numeric Column Comments: NA" + ] + } + + Column No: 3 + + Example Output 4) + { + "Descriptor - Sensitive": { + "Is PII": true, + "Confidence Value": 0.98, + "Reason": "The 'customer_email' column is identified as PII because it matches the 'Email Address' item from the provided PII list." + } + } + + Now, based on the instructions and examples above, please generate the appropriate JSON output only for the following JSON Source Table Object and Column No inputs. {hints} + + JSON Source Table Object: {new_json} + + Column No: {column_no} + + "#, + } + } +} + +