diff --git a/lakehouse_engine/core/definitions.html b/lakehouse_engine/core/definitions.html
index 7b2d8f2..5817fc5 100644
--- a/lakehouse_engine/core/definitions.html
+++ b/lakehouse_engine/core/definitions.html
@@ -6407,7 +6407,7 @@
Inherited Members
COMBINED_CONFIGURATION = - <GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'N', 'Y'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n calendar_date as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'QUARTER', 'DAY', 'MONTH', 'YEAR'}, 'week_start': 'M', 'snap_flag': {'N', 'Y'}, 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n case\n when '${config_week_start}' = 'Monday' then weekend_mon\n when '${config_week_start}' = 'Sunday' then weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct month_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_mon\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_sun\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'N', 'Y'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct quarter_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 
'Y', 'join_select': "\n select distinct quarter_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n month_end as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 13: {'cadence': 'QUARTER', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\n )\n )-1\n "}, 17: {'cadence': 'YEAR', 'recon': {'QUARTER', 'DAY', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when '${rec_cadence}' = 'DAY' then calendar_date\n when '${rec_cadence}' = 'MONTH' then month_end\n when '${rec_cadence}' = 'QUARTER' then quarter_end\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'N', 'Y'}, 'join_select': "\n select distinct\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekstart_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then weekstart_sun\n else\n date(date_trunc('${cad}',calendar_date))\n end as cadence_start_date,\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekend_mon\n when '${cad}' = 'WEEK' and 
'${config_week_start}' = 'Sunday'\n then weekend_sun\n when '${cad}' = 'DAY'\n then date(date_trunc('${cad}',calendar_date))\n when '${cad}' = 'MONTH'\n then date(\n date_trunc(\n 'MONTH',\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\n )\n )-1\n when '${cad}' = 'QUARTER'\n then date(\n date_trunc(\n 'QUARTER',\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\n )\n )-1\n when '${cad}' = 'YEAR'\n then date(\n date_trunc(\n 'YEAR',\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\n )\n )-1\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}> + <GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n calendar_date as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'YEAR', 'DAY', 'MONTH', 'QUARTER'}, 'week_start': 'M', 'snap_flag': {'Y', 'N'}, 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n case\n when '${config_week_start}' = 'Monday' then weekend_mon\n when '${config_week_start}' = 'Sunday' then weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct month_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_mon\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_sun\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n calendar_date as 
cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct quarter_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct quarter_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n month_end as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 13: {'cadence': 'QUARTER', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\n )\n )-1\n "}, 17: {'cadence': 'YEAR', 'recon': {'DAY', 'MONTH', 'QUARTER'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when '${rec_cadence}' = 'DAY' then calendar_date\n when '${rec_cadence}' = 'MONTH' then month_end\n when '${rec_cadence}' = 'QUARTER' then quarter_end\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 
'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': "\n select distinct\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekstart_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then weekstart_sun\n else\n date(date_trunc('${cad}',calendar_date))\n end as cadence_start_date,\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekend_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then weekend_sun\n when '${cad}' = 'DAY'\n then date(date_trunc('${cad}',calendar_date))\n when '${cad}' = 'MONTH'\n then date(\n date_trunc(\n 'MONTH',\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\n )\n )-1\n when '${cad}' = 'QUARTER'\n then date(\n date_trunc(\n 'QUARTER',\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\n )\n )-1\n when '${cad}' = 'YEAR'\n then date(\n date_trunc(\n 'YEAR',\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\n )\n )-1\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}>
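The combined configuration above is a set of cadence/reconciliation templates whose ${cad}, ${date_column}, ${config_week_start} and ${rec_cadence} placeholders get resolved at runtime. As a minimal sketch of what that substitution produces (using Python's string.Template purely for illustration — the engine's own resolution mechanism and the order_date column name are assumptions), entry 7's projection expressions for a MONTH cadence resolve like this:

    from string import Template

    # Projection templates taken from entry 7 of COMBINED_CONFIGURATION above.
    project_start = Template("date(date_trunc('${cad}',${date_column}))")
    project_end = Template("date(date_trunc('MONTH',add_months(${date_column}, 1)))-1")

    # Hypothetical runtime values: MONTH cadence over an assumed 'order_date' column.
    values = {"cad": "MONTH", "date_column": "order_date"}

    print(project_start.substitute(values))
    # date(date_trunc('MONTH',order_date))
    print(project_end.substitute(values))
    # date(date_trunc('MONTH',add_months(order_date, 1)))-1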
diff --git a/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html b/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html
index a4f2a9d..8f9f158 100644
--- a/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html
+++ b/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html
@@ -688,7 +688,7 @@
Inherited Members
- JDBCExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20240617164531', max_timestamp_custom_schema: Optional[str] = None)
+ JDBCExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20240617173044', max_timestamp_custom_schema: Optional[str] = None)
diff --git a/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html b/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html
index db96f10..cbf5778 100644
--- a/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html
+++ b/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html
@@ -556,7 +556,7 @@
Inherited Members
- SAPB4Extraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: str = 'REQTSN DECIMAL(23,0)', min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20240617164531', max_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)', latest_timestamp_input_col: str = 'REQTSN', request_status_tbl: str = 'SAPHANADB.RSPMREQUEST', request_col_name: str = 'REQUEST_TSN', data_target: Optional[str] = None, act_req_join_condition: Optional[str] = None, include_changelog_tech_cols: Optional[bool] = None, extra_cols_req_status_tbl: Optional[str] = None, request_status_tbl_filter: Optional[str] = None, adso_type: Optional[str] = None, default_max_timestamp: str = '1970000000000000000000')
+ SAPB4Extraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: str = 'REQTSN DECIMAL(23,0)', min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20240617173044', max_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)', latest_timestamp_input_col: str = 'REQTSN', request_status_tbl: str = 'SAPHANADB.RSPMREQUEST', request_col_name: str = 'REQUEST_TSN', data_target: Optional[str] = None, act_req_join_condition: Optional[str] = None, include_changelog_tech_cols: Optional[bool] = None, extra_cols_req_status_tbl: Optional[str] = None, request_status_tbl_filter: Optional[str] = None, adso_type: Optional[str] = None, default_max_timestamp: str = '1970000000000000000000')
diff --git a/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html b/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html
index 6ff64f2..950f125 100644
--- a/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html
+++ b/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html
@@ -530,7 +530,7 @@

- SAPBWExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20240617164531', max_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)', latest_timestamp_input_col: str = 'actrequest_timestamp', act_request_table: str = 'SAPPHA.RSODSACTREQ', request_col_name: str = 'actrequest', act_req_join_condition: Optional[str] = None, odsobject: Optional[str] = None, include_changelog_tech_cols: bool = True, extra_cols_act_request: Optional[str] = None, get_timestamp_from_act_request: bool = False, sap_bw_schema: str = 'SAPPHA', default_max_timestamp: str = '197000000000000')
+ SAPBWExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20240617173044', max_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)', latest_timestamp_input_col: str = 'actrequest_timestamp', act_request_table: str = 'SAPPHA.RSODSACTREQ', request_col_name: str = 'actrequest', act_req_join_condition: Optional[str] = None, odsobject: Optional[str] = None, include_changelog_tech_cols: bool = True, extra_cols_act_request: Optional[str] = None, get_timestamp_from_act_request: bool = False, sap_bw_schema: str = 'SAPPHA', default_max_timestamp: str = '197000000000000')
diff --git a/search.js b/search.js index f79efa9..4fefc10 100644 --- a/search.js +++ b/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();oLakehouse engine package containing all the system subpackages.

\n\n\n"}, {"fullname": "lakehouse_engine.algorithms", "modulename": "lakehouse_engine.algorithms", "kind": "module", "doc": "

Package containing all the lakehouse engine algorithms.

\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "kind": "module", "doc": "

Module containing the Algorithm class.

\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm", "kind": "class", "doc": "

Class to define the behavior of every algorithm based on ACONs.

\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.__init__", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.__init__", "kind": "function", "doc": "

Construct Algorithm instances.

\n\n
Arguments:
\n\n
    \n
  • acon: algorithm configuration.
  • \n
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.get_dq_spec", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.get_dq_spec", "kind": "function", "doc": "

Get data quality specification object from acon.

\n\n
Arguments:
\n\n
    \n
  • spec: data quality specifications.
  • \n
\n\n
Returns:
\n\n
\n

The DQSpec and the List of DQ Functions Specs.

\n
\n", "signature": "(\tcls,\tspec: dict) -> Tuple[lakehouse_engine.core.definitions.DQSpec, List[lakehouse_engine.core.definitions.DQFunctionSpec], List[lakehouse_engine.core.definitions.DQFunctionSpec]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader", "modulename": "lakehouse_engine.algorithms.data_loader", "kind": "module", "doc": "

Module to define DataLoader class.

\n"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader", "kind": "class", "doc": "

Load data using an algorithm configuration (ACON represented as dict).

\n\n

This algorithm focuses on the cases where users will be specifying all the algorithm\nsteps and configurations through a dict based configuration, which we name ACON\nin our framework.

\n\n

Since an ACON is a dict you can pass a custom transformer through a python function\nand, therefore, the DataLoader can also be used to load data with custom\ntransformations not provided in our transformers package.

\n\n

As the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.__init__", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.__init__", "kind": "function", "doc": "

Construct DataLoader algorithm instances.

\n\n

A data loader needs several specifications to work properly,\nbut some of them might be optional. The available specifications are:

\n\n
    \n
  • input specifications (mandatory): specify how to read data.
  • \n
  • transform specifications (optional): specify how to transform data.
  • \n
  • data quality specifications (optional): specify how to execute the data\nquality process.
  • \n
  • output specifications (mandatory): specify how to write data to the\ntarget.
  • \n
  • terminate specifications (optional): specify what to do after writing into\nthe target (e.g., optimizing target table, vacuum, compute stats, etc).
  • \n
\n\n
Arguments:
\n\n
    \n
  • acon: algorithm configuration.
  • \n
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.read", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.read", "kind": "function", "doc": "

Read data from an input location into a distributed dataframe.

\n\n
Returns:
\n\n
\n

An ordered dict with all the dataframes that were read.

\n
\n", "signature": "(self) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.transform", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.transform", "kind": "function", "doc": "

Transform (optionally) the data that was read.

\n\n

If there isn't a transformation specification, this step will be skipped and the\noriginal dataframes that were read will be returned.\nTransformations can depend on the result of another transformation. However,\nkeep in mind that if we are using a streaming source and for some reason we\nneed to enable micro batch processing, that result cannot be used as input to\nanother transformation. Micro batch processing in pyspark streaming is only\navailable in .write(), which means a transformation with micro batch needs\nto be the end of the process.

\n\n
Arguments:
\n\n
    \n
  • data: input dataframes in an ordered dict.
  • \n
\n\n
Returns:
\n\n
\n

Another ordered dict with the transformed dataframes, according to the\n transformation specification.

\n
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.process_dq", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.process_dq", "kind": "function", "doc": "

Process the data quality tasks for the data that was read and/or transformed.

\n\n

It supports multiple input dataframes, although just one is advisable.

\n\n

It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

\n\n
Arguments:
\n\n
    \n
  • data: dataframes from previous steps of the algorithm that we wish to\nrun the DQ process on.
  • \n
\n\n
Returns:
\n\n
\n

Another ordered dict with the validated dataframes.

\n
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.write", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.write", "kind": "function", "doc": "

Write the data that was read and transformed (if applicable).

\n\n

It supports writing multiple datasets. However, we only recommend writing one\ndataframe. This recommendation is based on ease of debugging and reproducibility,\nsince if we start mixing several datasets being fueled by the same algorithm, it\nwould unleash an infinite sea of reproducibility issues plus tight coupling and\ndependencies between datasets. Having said that, there may be cases where\nwriting multiple datasets is desirable according to the use case requirements.\nUse it accordingly.

\n\n
Arguments:
\n\n
    \n
  • data: dataframes that were read and transformed (if applicable).
  • \n
\n\n
Returns:
\n\n
\n

Dataframes that were written.

\n
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.terminate", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.terminate", "kind": "function", "doc": "

Terminate the algorithm.

\n\n
Arguments:
\n\n
    \n
  • data: dataframes that were written.
  • \n
\n", "signature": "(self, data: collections.OrderedDict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.execute", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.execute", "kind": "function", "doc": "

Define the algorithm execution behaviour.

\n", "signature": "(self) -> Optional[collections.OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator", "modulename": "lakehouse_engine.algorithms.dq_validator", "kind": "module", "doc": "

Module to define Data Validator class.

\n"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator", "kind": "class", "doc": "

Validate data using an algorithm configuration (ACON represented as dict).

\n\n

This algorithm focuses on isolating Data Quality Validations from the loading process,\napplying a set of data quality functions to a specific input dataset,\nwithout the need to define any output specification.\nYou can use any input specification compatible with the lakehouse engine\n(dataframe, table, files, etc).

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.__init__", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.__init__", "kind": "function", "doc": "

Construct DQValidator algorithm instances.

\n\n

A data quality validator needs the following specifications to work\nproperly:\n - input specification (mandatory): specify how and what data to\n read.\n - data quality specification (mandatory): specify how to execute\n the data quality process.\n - restore_prev_version (optional): specify if, having\n delta table/files as input, they should be restored to the\n previous version if the data quality process fails. Note: this\n is only considered if fail_on_error is kept as True.

\n\n
Arguments:
\n\n
    \n
  • acon: algorithm configuration.
  • \n
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.read", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.read", "kind": "function", "doc": "

Read data from an input location into a distributed dataframe.

\n\n
Returns:
\n\n
\n

Dataframe with data that was read.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.process_dq", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.process_dq", "kind": "function", "doc": "

Process the data quality tasks for the data that was read.

\n\n

It supports a single input dataframe.

\n\n

It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

\n\n
Arguments:
\n\n
    \n
  • data: input dataframe on which to run the DQ process.
  • \n
\n\n
Returns:
\n\n
\n

Validated dataframe.

\n
\n", "signature": "(\tself,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.execute", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.execute", "kind": "function", "doc": "

Define the algorithm execution behaviour.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.exceptions", "modulename": "lakehouse_engine.algorithms.exceptions", "kind": "module", "doc": "

Package defining all the algorithm custom exceptions.

\n"}, {"fullname": "lakehouse_engine.algorithms.exceptions.ReconciliationFailedException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "ReconciliationFailedException", "kind": "class", "doc": "

Exception for when the reconciliation process fails.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.NoNewDataException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "NoNewDataException", "kind": "class", "doc": "

Exception for when no new data is available.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.SensorAlreadyExistsException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "SensorAlreadyExistsException", "kind": "class", "doc": "

Exception for when a sensor with the same sensor id already exists.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.RestoreTypeNotFoundException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "RestoreTypeNotFoundException", "kind": "class", "doc": "

Exception for when the restore type is not found.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.gab", "modulename": "lakehouse_engine.algorithms.gab", "kind": "module", "doc": "

Module to define Gold Asset Builder algorithm behavior.

\n"}, {"fullname": "lakehouse_engine.algorithms.gab.GAB", "modulename": "lakehouse_engine.algorithms.gab", "qualname": "GAB", "kind": "class", "doc": "

Class representing the gold asset builder.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.gab.GAB.__init__", "modulename": "lakehouse_engine.algorithms.gab", "qualname": "GAB.__init__", "kind": "function", "doc": "

Construct GAB instances.

\n\n
Arguments:
\n\n
    \n
  • acon: algorithm configuration.
  • \n
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.gab.GAB.execute", "modulename": "lakehouse_engine.algorithms.gab", "qualname": "GAB.execute", "kind": "function", "doc": "

Execute the Gold Asset Builder.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "kind": "module", "doc": "

Module containing the Reconciliator class.

\n"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType", "kind": "class", "doc": "

Type of Reconciliation.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.PCT", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.PCT", "kind": "variable", "doc": "

\n", "default_value": "<ReconciliationType.PCT: 'percentage'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.ABS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.ABS", "kind": "variable", "doc": "

\n", "default_value": "<ReconciliationType.ABS: 'absolute'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers", "kind": "class", "doc": "

Transformers Available for the Reconciliation Algorithm.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "kind": "variable", "doc": "

\n", "annotation": ": dict", "default_value": "<ReconciliationTransformers.AVAILABLE_TRANSFORMERS: {'cache': <bound method Optimizers.cache of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>, 'persist': <bound method Optimizers.persist of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>}>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator", "kind": "class", "doc": "

Class to define the behavior of an algorithm that checks if data reconciles.

\n\n

Checking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.

\n\n

The reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).

\n\n

All of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.

\n\n
\n\n

It is crucial that both the current and truth datasets have exactly the same\nstructure.

\n\n
\n\n
\n\n

You should not use 0 as the yellow or red threshold, as the algorithm will verify\nif the difference between the truth and current values is bigger\nthan or equal to those thresholds.

\n\n
\n\n
\n\n

The reconciliation does not produce any negative values or percentages, as we\nuse the absolute value of the differences. This means that the recon result\nwill not indicate if it was the current values that were bigger or smaller\nthan the truth values, or vice versa.

\n\n
\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.__init__", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.__init__", "kind": "function", "doc": "

Construct Algorithm instances.

\n\n
Arguments:
\n\n
    \n
  • acon: algorithm configuration.
  • \n
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_source_of_truth", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_source_of_truth", "kind": "function", "doc": "

Get the source of truth (expected result) for the reconciliation process.

\n\n
Returns:
\n\n
\n

DataFrame containing the source of truth.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_current_results", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_current_results", "kind": "function", "doc": "

Get the current results from the table that we are checking if it reconciles.

\n\n
Returns:
\n\n
\n

DataFrame containing the current results.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.execute", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.execute", "kind": "function", "doc": "

Reconcile the current results against the truth dataset.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.sensor", "modulename": "lakehouse_engine.algorithms.sensor", "kind": "module", "doc": "

Module to define Sensor algorithm behavior.

\n"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor", "kind": "class", "doc": "

Class representing a sensor to check if the upstream has new data.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.__init__", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.__init__", "kind": "function", "doc": "

Construct Sensor instances.

\n\n
Arguments:
\n\n
    \n
  • acon: algorithm configuration.
  • \n
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.execute", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.execute", "kind": "function", "doc": "

Execute the sensor.

\n", "signature": "(self) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.configs", "modulename": "lakehouse_engine.configs", "kind": "module", "doc": "

This module receives a config file which is included in the wheel.

\n"}, {"fullname": "lakehouse_engine.core", "modulename": "lakehouse_engine.core", "kind": "module", "doc": "

Package with the core behaviour of the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager", "modulename": "lakehouse_engine.core.dbfs_file_manager", "kind": "module", "doc": "

File manager module using dbfs.

\n"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager", "kind": "class", "doc": "

Set of actions to manipulate dbfs files in several ways.

\n", "bases": "lakehouse_engine.core.file_manager.FileManager"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.get_function", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.delete_objects", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.delete_objects", "kind": "function", "doc": "

Delete objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.copy_objects", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.copy_objects", "kind": "function", "doc": "

Copies objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be copied based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.move_objects", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.move_objects", "kind": "function", "doc": "

Moves objects and 'directories'.

\n\n

If dry_run is set to True, the function will print a dict with all the\npaths that would be moved based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions", "modulename": "lakehouse_engine.core.definitions", "kind": "module", "doc": "

Definitions of standard values and structures for core components.

\n"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage", "kind": "class", "doc": "

Options for collecting engine usage stats.

\n\n
    \n
  • enabled, enables the collection and storage of Lakehouse Engine\nusage statistics for any environment.
  • \n
  • prod_only, enables the collection and storage of Lakehouse Engine\nusage statistics for production environment only.
  • \n
  • disabled, disables the collection and storage of Lakehouse Engine\nusage statistics for all environments.
  • \n
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage.ENABLED", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage.ENABLED", "kind": "variable", "doc": "

\n", "default_value": "<CollectEngineUsage.ENABLED: 'enabled'>"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage.PROD_ONLY", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage.PROD_ONLY", "kind": "variable", "doc": "

\n", "default_value": "<CollectEngineUsage.PROD_ONLY: 'prod_only'>"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage.DISABLED", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage.DISABLED", "kind": "variable", "doc": "

\n", "default_value": "<CollectEngineUsage.DISABLED: 'disabled'>"}, {"fullname": "lakehouse_engine.core.definitions.EngineConfig", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineConfig", "kind": "class", "doc": "

Definitions that can come from the Engine Config file.

\n\n
    \n
  • dq_bucket: S3 bucket used to store data quality related artifacts.
  • \n
  • notif_disallowed_email_servers: email servers not allowed to be used\nfor sending notifications.
  • \n
  • engine_usage_path: path where the engine prod usage stats are stored.
  • \n
  • engine_dev_usage_path: path where the engine dev usage stats are stored.
  • \n
  • collect_engine_usage: whether to enable the collection of lakehouse\nengine usage stats or not.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.EngineConfig.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineConfig.__init__", "kind": "function", "doc": "

\n", "signature": "(\tdq_bucket: Optional[str] = None,\tnotif_disallowed_email_servers: Optional[list] = None,\tengine_usage_path: Optional[str] = None,\tengine_dev_usage_path: Optional[str] = None,\tcollect_engine_usage: str = 'enabled')"}, {"fullname": "lakehouse_engine.core.definitions.EngineStats", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineStats", "kind": "class", "doc": "

Definitions for collection of Lakehouse Engine Stats.

\n\n
\n\n

Note: whenever the value comes from a key inside a Spark Config\nthat returns an array, it can be specified with a '#' so that it\nis adequately processed.

\n\n
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.EngineStats.CLUSTER_USAGE_TAGS", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineStats.CLUSTER_USAGE_TAGS", "kind": "variable", "doc": "

\n", "default_value": "<EngineStats.CLUSTER_USAGE_TAGS: 'spark.databricks.clusterUsageTags'>"}, {"fullname": "lakehouse_engine.core.definitions.EngineStats.DEF_SPARK_CONFS", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineStats.DEF_SPARK_CONFS", "kind": "variable", "doc": "

\n", "default_value": "<EngineStats.DEF_SPARK_CONFS: {'dp_name': 'spark.databricks.clusterUsageTags.clusterAllTags#accountName', 'environment': 'spark.databricks.clusterUsageTags.clusterAllTags#environment', 'workspace_id': 'spark.databricks.clusterUsageTags.orgId', 'job_id': 'spark.databricks.clusterUsageTags.clusterAllTags#JobId', 'job_name': 'spark.databricks.clusterUsageTags.clusterAllTags#RunName', 'run_id': 'spark.databricks.clusterUsageTags.clusterAllTags#ClusterName'}>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat", "kind": "class", "doc": "

Formats of algorithm input.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JDBC", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.AVRO", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.PARQUET", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DELTAFILES", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CLOUDFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CLOUDFILES", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.CLOUDFILES: 'cloudfiles'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.KAFKA", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SQL", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SQL", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SQL: 'sql'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_BW", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_BW", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SAP_BW: 'sap_bw'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_B4", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_B4", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SAP_B4: 'sap_b4'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DATAFRAME", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SFTP", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SFTP", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SFTP: 'sftp'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.values", "kind": "function", "doc": "

Generates a list containing all enum values.

\n\n
Return:
\n\n
\n

A list with all enum values.

\n
\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.exists", "kind": "function", "doc": "

Checks if the input format exists in the enum values.

\n\n
Arguments:
\n\n
    \n
  • input_format: format to check for existence.
  • \n
\n\n
Return:
\n\n
\n

If the input format exists in our enum.

\n
\n", "signature": "(cls, input_format: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat", "kind": "class", "doc": "

Formats of algorithm output.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JDBC", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.AVRO", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.PARQUET", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DELTAFILES", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.KAFKA", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CONSOLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CONSOLE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.CONSOLE: 'console'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.NOOP", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.NOOP", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.NOOP: 'noop'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DATAFRAME", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.FILE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.FILE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.FILE: 'file'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.TABLE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.TABLE: 'table'>"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType", "kind": "class", "doc": "

Type of notifier available.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType.EMAIL", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType.EMAIL", "kind": "variable", "doc": "

\n", "default_value": "<NotifierType.EMAIL: 'email'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters", "kind": "class", "doc": "

Parameters to be replaced at runtime.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "kind": "variable", "doc": "

\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_JOB_NAME: 'databricks_job_name'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "kind": "variable", "doc": "

\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID: 'databricks_workspace_id'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType", "kind": "class", "doc": "

Define the types of read operations.

\n\n
    \n
  • BATCH - read the data in batch mode (e.g., Spark batch).
  • \n
  • STREAMING - read the data in streaming mode (e.g., Spark streaming).
  • \n
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.BATCH", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.BATCH", "kind": "variable", "doc": "

\n", "default_value": "<ReadType.BATCH: 'batch'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.STREAMING", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.STREAMING", "kind": "variable", "doc": "

\n", "default_value": "<ReadType.STREAMING: 'streaming'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode", "kind": "class", "doc": "

Different modes that control how we handle compliance with the provided schema.

\n\n

These read modes map to Spark's read modes at the moment.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.PERMISSIVE", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.PERMISSIVE", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.PERMISSIVE: 'PERMISSIVE'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.FAILFAST", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.FAILFAST", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.FAILFAST: 'FAILFAST'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.DROPMALFORMED", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.DROPMALFORMED", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.DROPMALFORMED: 'DROPMALFORMED'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults", "kind": "class", "doc": "

Defaults used on the data quality process.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_STORE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_STORE: 'file_system'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_S3_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_S3_STORE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_BATCH_IDENTIFIERS", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_BATCH_IDENTIFIERS", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DQ_BATCH_IDENTIFIERS: ['spec_id', 'input_id', 'timestamp']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATASOURCE_CLASS_NAME: 'Datasource'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_EXECUTION_ENGINE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_EXECUTION_ENGINE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATASOURCE_EXECUTION_ENGINE: 'SparkDFExecutionEngine'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CONNECTORS_CLASS_NAME: 'RuntimeDataConnector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_MODULE_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_MODULE_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CONNECTORS_MODULE_NAME: 'great_expectations.datasource.data_connector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CLASS_NAME: 'SimpleCheckpoint'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION: 1.0>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.STORE_BACKEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.STORE_BACKEND", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.EXPECTATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.EXPECTATIONS_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.EXPECTATIONS_STORE_PREFIX: 'dq/expectations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATIONS_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.VALIDATIONS_STORE_PREFIX: 'dq/validations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_DOCS_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_DOCS_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_DOCS_PREFIX: 'dq/data_docs/site/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CHECKPOINT_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CHECKPOINT_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.CHECKPOINT_STORE_PREFIX: 'dq/checkpoints/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.VALIDATION_COLUMN_IDENTIFIER: 'validationresultidentifier'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CUSTOM_EXPECTATION_LIST", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CUSTOM_EXPECTATION_LIST", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.CUSTOM_EXPECTATION_LIST: ['expect_column_values_to_be_date_not_older_than', 'expect_column_pair_a_to_be_smaller_or_equal_than_b', 'expect_multicolumn_column_a_must_equal_b_or_c', 'expect_queried_column_agg_value_to_be']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_VALIDATIONS_SCHEMA", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_VALIDATIONS_SCHEMA", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DQ_VALIDATIONS_SCHEMA: StructType([StructField('dq_validations', StructType([StructField('run_name', StringType(), True), StructField('run_success', BooleanType(), True), StructField('raised_exceptions', BooleanType(), True), StructField('run_row_success', BooleanType(), True), StructField('dq_failure_details', ArrayType(StructType([StructField('expectation_type', StringType(), True), StructField('kwargs', StringType(), True)]), True), True)]), True)])>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType", "kind": "class", "doc": "

Types of write operations.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.OVERWRITE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.OVERWRITE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.OVERWRITE: 'overwrite'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.COMPLETE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.COMPLETE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.COMPLETE: 'complete'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.APPEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.APPEND", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.APPEND: 'append'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.UPDATE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.UPDATE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.UPDATE: 'update'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.MERGE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.MERGE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.MERGE: 'merge'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.ERROR_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.ERROR_IF_EXISTS", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.ERROR_IF_EXISTS: 'error'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.IGNORE_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.IGNORE_IF_EXISTS", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.IGNORE_IF_EXISTS: 'ignore'>"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec", "kind": "class", "doc": "

Specification of an algorithm input.

\n\n

This is closely aligned with the way the execution environment connects to the sources\n(e.g., spark sources).

\n\n
    \n
  • spec_id: spec_id of the input specification.
  • \n
  • read_type: ReadType type of read operation.
  • \n
  • data_format: format of the input.
  • \n
  • sftp_files_format: format of the files (csv, fwf, json, xml...) in an sftp\ndirectory.
  • \n
  • df_name: dataframe name.
  • \n
  • db_table: table name in the form of <db>.<table>.
  • \n
  • location: uri that identifies from where to read data in the specified format.
  • \n
  • enforce_schema_from_table: if we want to enforce the table schema or not, by\nproviding a table name in the form of <db>.<table>.
  • \n
  • query: sql query to execute and return the dataframe. Use it if you do not want to\nread from a file system or a table, but rather from a sql query.
  • \n
  • schema: dict representation of a schema of the input (e.g., Spark struct type\nschema).
  • \n
  • schema_path: path to a file with a representation of a schema of the input (e.g.,\nSpark struct type schema).
  • \n
  • disable_dbfs_retry: optional flag to disable file storage dbfs.
  • \n
  • with_filepath: if we want to include the path of the file that is being read. Only\nworks with the file reader (batch and streaming modes are supported).
  • \n
  • options: dict with other relevant options according to the execution\nenvironment (e.g., spark) possible sources.
  • \n
  • calculate_upper_bound: whether to calculate the upper bound for the extraction from SAP BW\nor not.
  • \n
  • calc_upper_bound_schema: specific schema for the calculated upper_bound.
  • \n
  • generate_predicates: whether to generate predicates for the extraction from SAP BW or not.
  • \n
  • predicates_add_null: if we want to include 'is null' in the partition by predicates.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tread_type: str,\tdata_format: Optional[str] = None,\tsftp_files_format: Optional[str] = None,\tdf_name: Optional[pyspark.sql.dataframe.DataFrame] = None,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tquery: Optional[str] = None,\tenforce_schema_from_table: Optional[str] = None,\tschema: Optional[dict] = None,\tschema_path: Optional[str] = None,\tdisable_dbfs_retry: bool = False,\twith_filepath: bool = False,\toptions: Optional[dict] = None,\tjdbc_args: Optional[dict] = None,\tcalculate_upper_bound: bool = False,\tcalc_upper_bound_schema: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates_add_null: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec", "kind": "class", "doc": "

Transformer Specification, i.e., a single transformation amongst many.

\n\n
    \n
  • function: name of the function (or callable function) to be executed.
  • \n
  • args: (not applicable if using a callable function) dict with the arguments\nto pass to the function <k,v> pairs with the name of the parameter of\nthe function and the respective value.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(function: str, args: dict)"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec", "kind": "class", "doc": "

Transformation Specification.

\n\n

I.e., the specification that defines the many transformations to be done to the data\nthat was read.

\n\n
    \n
  • spec_id: id of the transform specification.
  • \n
  • input_id: id of the corresponding input\nspecification.
  • \n
  • transformers: list of transformers to execute.
  • \n
  • force_streaming_foreach_batch_processing: sometimes, when using streaming, we want\nto force the transform to be executed in the foreachBatch function to ensure\nnon-supported streaming operations can be properly executed.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\ttransformers: List[lakehouse_engine.core.definitions.TransformerSpec],\tforce_streaming_foreach_batch_processing: bool = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQType", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType", "kind": "class", "doc": "

Available data quality tasks.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQType.VALIDATOR", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.VALIDATOR", "kind": "variable", "doc": "

\n", "default_value": "<DQType.VALIDATOR: 'validator'>"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec", "kind": "class", "doc": "

Defines a data quality function specification.

\n\n
    \n
  • function - name of the data quality function (expectation) to execute.\nIt follows the great_expectations api https://greatexpectations.io/expectations/.
  • \n
  • args - args of the function (expectation). Follow the same api as above.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(function: str, args: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec", "kind": "class", "doc": "

Data quality overall specification.

\n\n
    \n
  • spec_id - id of the specification.
  • \n
  • input_id - id of the input specification.
  • \n
  • dq_type - type of DQ process to execute (e.g. validator).
  • \n
  • dq_functions - list of function specifications to execute.
  • \n
  • unexpected_rows_pk - the list of columns composing the primary key of the\nsource data to identify the rows failing the DQ validations. Note: only one\nof the tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It\nis mandatory to provide one of these arguments when using tag_source_data\nas True. When tag_source_data is False, this is not mandatory, but still\nrecommended.
  • \n
  • tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.\nNote: only one of the tbl_to_derive_pk or unexpected_rows_pk arguments needs to\nbe provided. It is mandatory to provide one of these arguments when using\ntag_source_data as True. When tag_source_data is False, this is not\nmandatory, but still recommended.
  • \n
  • gx_result_format - great expectations result format. Default: \"COMPLETE\".
  • \n
  • tag_source_data - when set to true, this will ensure that the DQ process ends by\ntagging the source data with an additional column with information about the\nDQ results. This column makes it possible to identify if the DQ run\nsucceeded in general and, if not, it unlocks the insights to know which\nspecific rows made the DQ validations fail and why. Default: False.\nNote: it only works if result_sink_explode is True, gx_result_format is\nCOMPLETE, fail_on_error is False (which is done automatically when\nyou specify tag_source_data as True) and tbl_to_derive_pk or\nunexpected_rows_pk is configured.
  • \n
  • store_backend - which store_backend to use (e.g. s3 or file_system).
  • \n
  • local_fs_root_dir - path of the root directory. Note: only applicable for\nstore_backend file_system.
  • \n
  • data_docs_local_fs - the path for data docs only for store_backend\nfile_system.
  • \n
  • bucket - the bucket name to consider for the store_backend (store DQ artefacts).\nNote: only applicable for store_backend s3.
  • \n
  • data_docs_bucket - the bucket name for data docs only. When defined, it will\nsupersede bucket parameter. Note: only applicable for store_backend s3.
  • \n
  • expectations_store_prefix - prefix where to store expectations' data. Note: only\napplicable for store_backend s3.
  • \n
  • validations_store_prefix - prefix where to store validations' data. Note: only\napplicable for store_backend s3.
  • \n
  • data_docs_prefix - prefix where to store data_docs' data.
  • \n
  • checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only\napplicable for store_backend s3.
  • \n
  • data_asset_name - name of the data asset to consider when configuring the great\nexpectations' data source.
  • \n
  • expectation_suite_name - name to consider for great expectations' suite.
  • \n
  • result_sink_db_table - db.table_name indicating the database and table in which\nto save the results of the DQ process.
  • \n
  • result_sink_location - file system location in which to save the results of the\nDQ process.
  • \n
  • result_sink_partitions - the list of partitions to consider.
  • \n
  • result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
  • \n
  • result_sink_options - extra spark options for configuring the result sink.\nE.g: can be used to configure a Kafka sink if result_sink_format is kafka.
  • \n
  • result_sink_explode - flag to determine if the output table/location should have\nthe columns exploded (as True) or not (as False). Default: True.
  • \n
  • result_sink_extra_columns - list of extra columns to be exploded (following\nthe pattern \".*\") or columns to be selected. It is only used when\nresult_sink_explode is set to True.
  • \n
  • source - name of the data source, to make it easier to identify in analysis. If not\nspecified, a default is used. This will only be used\nwhen result_sink_explode is set to True.
  • \n
  • fail_on_error - whether to fail the algorithm if the validations of your data in\nthe DQ process failed.
  • \n
  • cache_df - whether to cache the dataframe before running the DQ process or not.
  • \n
  • critical_functions - functions that should not fail. When this argument is\ndefined, fail_on_error is nullified.
  • \n
  • max_percentage_failure - percentage of failure that should be allowed.\nThis argument has priority over both fail_on_error and critical_functions.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\tdq_type: str,\tdq_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tunexpected_rows_pk: Optional[List[str]] = None,\ttbl_to_derive_pk: Optional[str] = None,\tgx_result_format: Optional[str] = 'COMPLETE',\ttag_source_data: Optional[bool] = False,\tstore_backend: str = 's3',\tlocal_fs_root_dir: Optional[str] = None,\tdata_docs_local_fs: Optional[str] = None,\tbucket: Optional[str] = None,\tdata_docs_bucket: Optional[str] = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tdata_docs_prefix: str = 'dq/data_docs/site/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/',\tdata_asset_name: Optional[str] = None,\texpectation_suite_name: Optional[str] = None,\tresult_sink_db_table: Optional[str] = None,\tresult_sink_location: Optional[str] = None,\tresult_sink_partitions: Optional[List[str]] = None,\tresult_sink_format: str = 'delta',\tresult_sink_options: Optional[dict] = None,\tresult_sink_explode: bool = True,\tresult_sink_extra_columns: Optional[List[str]] = None,\tsource: Optional[str] = None,\tfail_on_error: bool = True,\tcache_df: bool = False,\tcritical_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tmax_percentage_failure: Optional[float] = None)"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions", "kind": "class", "doc": "

Options for a merge operation.

\n\n
    \n
  • merge_predicate: predicate to apply to the merge operation so that we can\ncheck if a new record corresponds to a record already included in the\nhistorical data.
  • \n
  • insert_only: indicates if the merge should only insert data (e.g., deduplicate\nscenarios).
  • \n
  • delete_predicate: predicate to apply to the delete operation.
  • \n
  • update_predicate: predicate to apply to the update operation.
  • \n
  • insert_predicate: predicate to apply to the insert operation.
  • \n
  • update_column_set: rules to apply to the update operation which allows to\nset the value for each column to be updated.\n(e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )
  • \n
  • insert_column_set: rules to apply to the insert operation which allows to\nset the value for each column to be inserted.\n(e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions.__init__", "kind": "function", "doc": "

\n", "signature": "(\tmerge_predicate: str,\tinsert_only: bool = False,\tdelete_predicate: Optional[str] = None,\tupdate_predicate: Optional[str] = None,\tinsert_predicate: Optional[str] = None,\tupdate_column_set: Optional[dict] = None,\tinsert_column_set: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec", "kind": "class", "doc": "

Specification of an algorithm output.

\n\n

This is closely aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).

\n\n
    \n
  • spec_id: id of the output specification.
  • \n
  • input_id: id of the corresponding input specification.
  • \n
  • write_type: type of write operation.
  • \n
  • data_format: format of the output. Defaults to DELTA.
  • \n
  • db_table: table name in the form of <db>.<table>.
  • \n
  • location: uri that identifies where to write data in the specified format.
  • \n
  • partitions: list of partition input_col names.
  • \n
  • merge_opts: options to apply to the merge operation.
  • \n
  • streaming_micro_batch_transformers: transformers to invoke for each streaming\nmicro batch, before writing (i.e., in Spark's foreachBatch structured\nstreaming function). Note: the lakehouse engine manages this for you, so\nyou don't have to manually specify streaming transformations here, so we don't\nadvise you to manually specify transformations through this parameter. Supply\nthem as regular transformers in the transform_specs sections of an ACON.
  • \n
  • streaming_once: if the streaming query is to be executed just once, or not,\ngenerating just one micro batch.
  • \n
  • streaming_processing_time: if streaming query is to be kept alive, this indicates\nthe processing time of each micro batch.
  • \n
  • streaming_available_now: if set to True, set a trigger that processes all\navailable data in multiple batches then terminates the query.\nWhen using streaming, this is the default trigger that the lakehouse-engine will\nuse, unless you configure a different one.
  • \n
  • streaming_continuous: set a trigger that runs a continuous query with a given\ncheckpoint interval.
  • \n
  • streaming_await_termination: whether to wait (True) for the termination of the\nstreaming query (e.g. timeout or exception) or not (False). Default: True.
  • \n
  • streaming_await_termination_timeout: a timeout to set to the\nstreaming_await_termination. Default: None.
  • \n
  • with_batch_id: whether to include the streaming batch id in the final data,\nor not. It only takes effect in streaming mode.
  • \n
  • options: dict with other relevant options according to the execution environment\n(e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for\nstreaming, etc.
  • \n
  • streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\nbut for the DQ functions to be executed. Used internally by the lakehouse\nengine, so you don't have to supply DQ functions through this parameter. Use the\ndq_specs of the acon instead.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\twrite_type: str,\tdata_format: str = 'delta',\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tmerge_opts: Optional[lakehouse_engine.core.definitions.MergeOptions] = None,\tpartitions: Optional[List[str]] = None,\tstreaming_micro_batch_transformers: Optional[List[lakehouse_engine.core.definitions.TransformerSpec]] = None,\tstreaming_once: Optional[bool] = None,\tstreaming_processing_time: Optional[str] = None,\tstreaming_available_now: bool = True,\tstreaming_continuous: Optional[str] = None,\tstreaming_await_termination: bool = True,\tstreaming_await_termination_timeout: Optional[int] = None,\twith_batch_id: bool = False,\toptions: Optional[dict] = None,\tstreaming_micro_batch_dq_processors: Optional[List[lakehouse_engine.core.definitions.DQSpec]] = None)"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec", "kind": "class", "doc": "

Terminator Specification.

\n\n

I.e., the specification that defines a terminator operation to be executed. Examples\nare compute statistics, vacuum, optimize, etc.

\n\n
    \n
  • function: terminator function to execute.
  • \n
  • args: arguments of the terminator function.
  • \n
  • input_id: id of the corresponding output specification (Optional).
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tfunction: str,\targs: Optional[dict] = None,\tinput_id: Optional[str] = None)"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec", "kind": "class", "doc": "

Reconciliator Specification.

\n\n
    \n
  • metrics: list of metrics in the form of:\n[{\n metric: name of the column present in both truth and current datasets,\n aggregation: sum, avg, max, min, ...,\n type: percentage or absolute,\n yellow: value,\n red: value\n}].
  • \n
  • recon_type: reconciliation type (percentage or absolute). Percentage calculates\nthe difference between truth and current results as a percentage ((x - y) / x), and\nabsolute calculates the raw difference (x - y).
  • \n
  • truth_input_spec: input specification of the truth data.
  • \n
  • current_input_spec: input specification of the current results data.
  • \n
  • truth_preprocess_query: additional query on top of the truth input data to\npreprocess the truth data before it gets fueled into the reconciliation process.\nImportant note: you need to assume that the data out of\nthe truth_input_spec is referencable by a table called 'truth'.
  • \n
  • truth_preprocess_query_args: optional dict having the functions/transformations to\napply on top of the truth_preprocess_query and respective arguments. Note: cache\nis being applied on the Dataframe, by default. For turning the default behavior\noff, pass \"truth_preprocess_query_args\": [].
  • \n
  • current_preprocess_query: additional query on top of the current results input\ndata to preprocess the current results data before it gets fueled into the\nreconciliation process. Important note: you need to assume that the data out of\nthe current_results_input_spec is referencable by a table called 'current'.
  • \n
  • current_preprocess_query_args: optional dict having the\nfunctions/transformations to apply on top of the current_preprocess_query\nand respective arguments. Note: cache is being applied on the Dataframe,\nby default. For turning the default behavior off, pass\n\"current_preprocess_query_args\": [].
  • \n
  • ignore_empty_df: optional boolean to ignore the recon process if both the source and\ntarget dataframes are empty; in that case, the recon exits with a success code (passed).
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tmetrics: List[dict],\ttruth_input_spec: lakehouse_engine.core.definitions.InputSpec,\tcurrent_input_spec: lakehouse_engine.core.definitions.InputSpec,\ttruth_preprocess_query: Optional[str] = None,\ttruth_preprocess_query_args: Optional[List[dict]] = None,\tcurrent_preprocess_query: Optional[str] = None,\tcurrent_preprocess_query_args: Optional[List[dict]] = None,\tignore_empty_df: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec", "kind": "class", "doc": "

Data Quality Validator Specification.

\n\n
    \n
  • input_spec: input specification of the data to be checked/validated.
  • \n
  • dq_spec: data quality specification.
  • \n
  • restore_prev_version: whether the input delta table/files should be restored to the\nprevious version if the data quality process fails. Note: this\nis only considered if fail_on_error is kept as True.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\trestore_prev_version: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions", "kind": "class", "doc": "

SQL definition statements.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.compute_table_stats", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.compute_table_stats", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.compute_table_stats: 'ANALYZE TABLE {} COMPUTE STATISTICS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_table_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_table_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.drop_table_stmt: 'DROP TABLE IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_view_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_view_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.drop_view_stmt: 'DROP VIEW IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.truncate_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.truncate_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.truncate_stmt: 'TRUNCATE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.describe_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.describe_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.describe_stmt: 'DESCRIBE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.optimize_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.optimize_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.optimize_stmt: 'OPTIMIZE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.show_tbl_props_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.show_tbl_props_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.show_tbl_props_stmt: 'SHOW TBLPROPERTIES'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.delete_where_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.delete_where_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.delete_where_stmt: 'DELETE FROM {} WHERE {}'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys", "kind": "class", "doc": "

File Manager S3 API keys.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTENTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTENTS", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.CONTENTS: 'Contents'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.KEY", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.KEY", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.KEY: 'Key'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTINUATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTINUATION", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.CONTINUATION: 'NextContinuationToken'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.BUCKET", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.BUCKET", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.BUCKET: 'Bucket'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.OBJECTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.OBJECTS", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.OBJECTS: 'Objects'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec", "kind": "class", "doc": "

Sensor Specification.

\n\n
    \n
  • sensor_id: sensor id.
  • \n
  • assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
  • \n
  • control_db_table_name: db.table to store sensor metadata.
  • \n
  • input_spec: input specification of the source to be checked for new data.
  • \n
  • preprocess_query: SQL query to transform/filter the result from the\nupstream. Consider that we should refer to 'new_data' whenever\nwe are referring to the input of the sensor. E.g.:\n \"SELECT dummy_col FROM new_data WHERE ...\"
  • \n
  • checkpoint_location: optional location to store checkpoints to resume\nfrom. These checkpoints use the same strategy as Spark checkpoints.\nFor Spark readers that do not support checkpoints, use the\npreprocess_query parameter to form a SQL query to filter the result\nfrom the upstream accordingly.
  • \n
  • fail_on_empty_result: if the sensor should throw an error if there is no new\ndata in the upstream. Default: True.
  • \n
\n"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tsensor_id: str,\tassets: List[str],\tcontrol_db_table_name: str,\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tpreprocess_query: Optional[str],\tcheckpoint_location: Optional[str],\tfail_on_empty_result: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.create_from_acon", "kind": "function", "doc": "

Create SensorSpec from acon.

\n\n
Arguments:
\n\n
    \n
  • acon: sensor ACON.
  • \n
\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus", "kind": "class", "doc": "

Status for a sensor.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.ACQUIRED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.ACQUIRED_NEW_DATA", "kind": "variable", "doc": "

\n", "default_value": "<SensorStatus.ACQUIRED_NEW_DATA: 'ACQUIRED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.PROCESSED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.PROCESSED_NEW_DATA", "kind": "variable", "doc": "

\n", "default_value": "<SensorStatus.PROCESSED_NEW_DATA: 'PROCESSED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain", "kind": "class", "doc": "

Defaults used when consuming data from SAP Logchain.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.DBTABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.DBTABLE", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.DBTABLE: 'SAPPHA.RSPCLOGCHAIN'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.GREEN_STATUS", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.GREEN_STATUS", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.GREEN_STATUS: 'G'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.ENGINE_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.ENGINE_TABLE", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.ENGINE_TABLE: 'sensor_new_data'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType", "kind": "class", "doc": "

Archive types.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.BULK", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.BULK", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.BULK: 'Bulk'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.STANDARD", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.STANDARD", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.STANDARD: 'Standard'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.EXPEDITED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.EXPEDITED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.EXPEDITED: 'Expedited'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.values", "kind": "function", "doc": "

Generates a list containing all enum values.

\n\n
Return:
\n\n
\n

A list with all enum values.

\n
\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.exists", "kind": "function", "doc": "

Checks if the restore type exists in the enum values.

\n\n
Arguments:
\n\n
    \n
  • restore_type: restore type to check for existence.
  • \n
\n\n
Return:
\n\n
\n

If the restore type exists in our enum.

\n
\n", "signature": "(cls, restore_type: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus", "kind": "class", "doc": "

Archive restore statuses.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.NOT_STARTED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.NOT_STARTED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.NOT_STARTED: 'not_started'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.ONGOING", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.ONGOING", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.ONGOING: 'ongoing'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.RESTORED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.RESTORED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.RESTORED: 'restored'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser", "kind": "class", "doc": "

Defaults to use for parsing.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.DOUBLE_QUOTES", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.DOUBLE_QUOTES", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.DOUBLE_QUOTES: '"'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.SINGLE_QUOTES", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.SINGLE_QUOTES", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.SINGLE_QUOTES: "'">"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.BACKSLASH", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.BACKSLASH", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.BACKSLASH: '\\\\'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.SINGLE_TRACE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.SINGLE_TRACE", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.SINGLE_TRACE: '-'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.DOUBLE_TRACES", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.DOUBLE_TRACES", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.DOUBLE_TRACES: '--'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.SLASH", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.SLASH", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.SLASH: '/'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.OPENING_MULTIPLE_LINE_COMMENT", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.OPENING_MULTIPLE_LINE_COMMENT", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.OPENING_MULTIPLE_LINE_COMMENT: '/*'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.CLOSING_MULTIPLE_LINE_COMMENT", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.CLOSING_MULTIPLE_LINE_COMMENT", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.CLOSING_MULTIPLE_LINE_COMMENT: '*/'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.PARAGRAPH", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.PARAGRAPH", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.PARAGRAPH: '\\n'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.STAR", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.STAR", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.STAR: '*'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.MULTIPLE_LINE_COMMENT", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.MULTIPLE_LINE_COMMENT", "kind": "variable", "doc": "

\n", "default_value": "<SQLParser.MULTIPLE_LINE_COMMENT: ['/*', '*/']>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults", "kind": "class", "doc": "

Defaults used on the GAB process.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DATE_FORMAT", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DATE_FORMAT", "kind": "variable", "doc": "

\n", "default_value": "<GABDefaults.DATE_FORMAT: '%Y-%m-%d'>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DIMENSIONS_DEFAULT_COLUMNS", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DIMENSIONS_DEFAULT_COLUMNS", "kind": "variable", "doc": "

\n", "default_value": "<GABDefaults.DIMENSIONS_DEFAULT_COLUMNS: ['from_date', 'to_date']>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE", "kind": "variable", "doc": "

\n", "default_value": "<GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE: 'dim_calendar'>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE", "kind": "variable", "doc": "

\n", "default_value": "<GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE: 'lkp_query_builder'>"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek", "kind": "class", "doc": "

Representation of start of week values on GAB.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.SUNDAY", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.SUNDAY", "kind": "variable", "doc": "

\n", "default_value": "<GABStartOfWeek.SUNDAY: 'S'>"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.MONDAY", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.MONDAY", "kind": "variable", "doc": "

\n", "default_value": "<GABStartOfWeek.MONDAY: 'M'>"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.get_start_of_week", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.get_start_of_week", "kind": "function", "doc": "

Get the start of week enum as a dict.

\n\n
Returns:
\n\n
\n

dict containing all enum entries as {name:value}.

\n
\n", "signature": "(cls) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.get_values", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.get_values", "kind": "function", "doc": "

Get the start of week enum values as set.

\n\n
Returns:
\n\n
\n

set containing all possible values {value}.

\n
\n", "signature": "(cls) -> set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABSpec", "kind": "class", "doc": "

Gab Specification.

\n\n

query_label_filter: query use-case label to execute.\nqueue_filter: queue to execute the job.\ncadence_filter: selected cadences to build the asset.\ntarget_database: target database to write.\ncurrent_date: current date.\nstart_date: period start date.\nend_date: period end date.\nrerun_flag: rerun flag.\ntarget_table: target table to write.\nsource_database: source database.\ngab_base_path: base path to read the use cases.\nlookup_table: gab configuration table.\ncalendar_table: gab calendar table.

\n"}, {"fullname": "lakehouse_engine.core.definitions.GABSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tquery_label_filter: list[str],\tqueue_filter: list[str],\tcadence_filter: list[str],\ttarget_database: str,\tcurrent_date: datetime.datetime,\tstart_date: datetime.datetime,\tend_date: datetime.datetime,\trerun_flag: str,\ttarget_table: str,\tsource_database: str,\tgab_base_path: str,\tlookup_table: str,\tcalendar_table: str)"}, {"fullname": "lakehouse_engine.core.definitions.GABSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABSpec.create_from_acon", "kind": "function", "doc": "

Create GabSpec from acon.

\n\n
Arguments:
\n\n
    \n
  • acon: gab ACON.
  • \n
\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence", "kind": "class", "doc": "

Representation of the supported cadences on GAB.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.DAY", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.DAY", "kind": "variable", "doc": "

\n", "default_value": "<GABCadence.DAY: 1>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.WEEK", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.WEEK", "kind": "variable", "doc": "

\n", "default_value": "<GABCadence.WEEK: 2>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.MONTH", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.MONTH", "kind": "variable", "doc": "

\n", "default_value": "<GABCadence.MONTH: 3>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.QUARTER", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.QUARTER", "kind": "variable", "doc": "

\n", "default_value": "<GABCadence.QUARTER: 4>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.YEAR", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.YEAR", "kind": "variable", "doc": "

\n", "default_value": "<GABCadence.YEAR: 5>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.get_ordered_cadences", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.get_ordered_cadences", "kind": "function", "doc": "

Get the cadences ordered by the value.

\n\n
Returns:
\n\n
\n

dict containing ordered cadences as {name:value}.

\n
\n", "signature": "(cls) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.get_cadences", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.get_cadences", "kind": "function", "doc": "

Get the cadences values as set.

\n\n
Returns:
\n\n
\n

set containing all possible cadence values as {value}.

\n
\n", "signature": "(cls) -> set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.order_cadences", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.order_cadences", "kind": "function", "doc": "

Order a list of cadences by value.

\n\n
Returns:
\n\n
\n

ordered set containing the received cadences.

\n
\n", "signature": "(cls, cadences_to_order: list[str]) -> list[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABKeys", "kind": "class", "doc": "

Constants used to update pre-configured gab dict key.

\n"}, {"fullname": "lakehouse_engine.core.definitions.GABReplaceableKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABReplaceableKeys", "kind": "class", "doc": "

Constants used to replace pre-configured gab dict values.

\n"}, {"fullname": "lakehouse_engine.core.definitions.GABCombinedConfiguration", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCombinedConfiguration", "kind": "class", "doc": "

GAB combined configuration.

\n\n

Based on the use case configuration, return the values to override in the SQL file.\nThis enum aims to exhaustively map each combination of cadence, reconciliation,\n week_start and snap_flag, returning the corresponding join_select,\n project_start and project_end values to replace in the stages SQL file.

\n\n

Return corresponding configuration (join_select, project_start, project_end) for\n each combination (cadence x recon x week_start x snap_flag).

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABCombinedConfiguration.COMBINED_CONFIGURATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCombinedConfiguration.COMBINED_CONFIGURATION", "kind": "variable", "doc": "

\n", "default_value": "<GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'N', 'Y'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\\n select distinct case\\n when '${config_week_start}' = 'Monday' then weekstart_mon\\n when '${config_week_start}' = 'Sunday' then weekstart_sun\\n end as cadence_start_date,\\n calendar_date as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'QUARTER', 'DAY', 'MONTH', 'YEAR'}, 'week_start': 'M', 'snap_flag': {'N', 'Y'}, 'join_select': "\\n select distinct case\\n when '${config_week_start}' = 'Monday' then weekstart_mon\\n when '${config_week_start}' = 'Sunday' then weekstart_sun\\n end as cadence_start_date,\\n case\\n when '${config_week_start}' = 'Monday' then weekend_mon\\n when '${config_week_start}' = 'Sunday' then weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct month_start as cadence_start_date,\\n calendar_date as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct month_start as cadence_start_date,\\n case\\n when date(\\n date_trunc('MONTH',add_months(calendar_date, 1))\\n )-1 < weekend_mon\\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct month_start as cadence_start_date,\\n case\\n when date(\\n date_trunc('MONTH',add_months(calendar_date, 1))\\n )-1 < weekend_sun\\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'N', 'Y'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct quarter_start as cadence_start_date,\\n calendar_date as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct quarter_start as cadence_start_date,\\n case\\n when weekend_mon > date(\\n date_trunc('QUARTER',add_months(calendar_date, 3))\\n )-1\\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 
'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct quarter_start as cadence_start_date,\\n case\\n when weekend_sun > date(\\n date_trunc('QUARTER',add_months(calendar_date, 3))\\n )-1\\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct quarter_start as cadence_start_date,\\n month_end as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\\n )\\n )-1\\n "}, 13: {'cadence': 'QUARTER', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\\n )\\n )-1\\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when weekend_mon > date(\\n date_trunc('YEAR',add_months(calendar_date, 12))\\n )-1\\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when weekend_sun > date(\\n date_trunc('YEAR',add_months(calendar_date, 12))\\n )-1\\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\\n )\\n )-1\\n "}, 17: {'cadence': 'YEAR', 'recon': {'QUARTER', 'DAY', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when '${rec_cadence}' = 'DAY' then calendar_date\\n when '${rec_cadence}' = 'MONTH' then month_end\\n when '${rec_cadence}' = 'QUARTER' then quarter_end\\n end as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'recon': {'WEEK', 'YEAR', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'N', 'Y'}, 'join_select': "\\n select distinct\\n case\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\\n then weekstart_mon\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\\n then weekstart_sun\\n else\\n date(date_trunc('${cad}',calendar_date))\\n end as cadence_start_date,\\n case\\n when '${cad}' = 'WEEK' and 
'${config_week_start}' = 'Monday'\\n then weekend_mon\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\\n then weekend_sun\\n when '${cad}' = 'DAY'\\n then date(date_trunc('${cad}',calendar_date))\\n when '${cad}' = 'MONTH'\\n then date(\\n date_trunc(\\n 'MONTH',\\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\\n )\\n )-1\\n when '${cad}' = 'QUARTER'\\n then date(\\n date_trunc(\\n 'QUARTER',\\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\\n )\\n )-1\\n when '${cad}' = 'YEAR'\\n then date(\\n date_trunc(\\n 'YEAR',\\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\\n )\\n )-1\\n end as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "

Module to take care of creating a singleton of the execution environment class.

\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv", "kind": "class", "doc": "

Represents the basic resources regarding the engine execution environment.

\n\n

Currently, it is used to encapsulate both the logic to get the Spark\nsession and the engine configurations.

\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.set_default_engine_config", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.set_default_engine_config", "kind": "function", "doc": "

Set default engine configurations by reading them from a specified package.

\n\n
Arguments:
\n\n
    \n
  • package: package where the engine configurations can be found.
  • \n
\n", "signature": "(cls, package: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.get_or_create", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.get_or_create", "kind": "function", "doc": "

Get or create an execution environment session (currently Spark).

\n\n

It instantiates a singleton session that can be accessed anywhere from the\nlakehouse engine.

\n\n
Arguments:
\n\n
    \n
  • session: spark session.
  • \n
  • enable_hive_support: whether to enable hive support or not.
  • \n
  • app_name: application name.
  • \n
  • config: extra spark configs to supply to the spark session.
  • \n
\n", "signature": "(\tcls,\tsession: pyspark.sql.session.SparkSession = None,\tenable_hive_support: bool = True,\tapp_name: str = None,\tconfig: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.executable", "modulename": "lakehouse_engine.core.executable", "kind": "module", "doc": "

Module representing an executable lakehouse engine component.

\n"}, {"fullname": "lakehouse_engine.core.executable.Executable", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable", "kind": "class", "doc": "

Abstract class defining the behaviour of an executable component.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.executable.Executable.execute", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable.execute", "kind": "function", "doc": "

Define the executable component behaviour.

\n\n

E.g., the behaviour of an algorithm inheriting from this.

\n", "signature": "(self) -> Optional[Any]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager", "modulename": "lakehouse_engine.core.file_manager", "kind": "module", "doc": "

Module for abstract representation of a file manager system.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager", "kind": "class", "doc": "

Abstract file manager class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.__init__", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.__init__", "kind": "function", "doc": "

Construct FileManager algorithm instances.

\n\n
Arguments:
\n\n
    \n
  • configs: configurations for the FileManager algorithm.
  • \n
\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.delete_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.delete_objects", "kind": "function", "doc": "

Delete objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.copy_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.copy_objects", "kind": "function", "doc": "

Copies objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be copied based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.move_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.move_objects", "kind": "function", "doc": "

Moves objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be moved based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManagerFactory", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManagerFactory", "kind": "class", "doc": "

Class for file manager factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.file_manager.FileManagerFactory.execute_function", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManagerFactory.execute_function", "kind": "function", "doc": "

Get a specific File Manager and function to execute.

\n", "signature": "(configs: dict) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_manager", "modulename": "lakehouse_engine.core.gab_manager", "kind": "module", "doc": "

Module to define GAB Manager classes.

\n"}, {"fullname": "lakehouse_engine.core.gab_manager.GABCadenceManager", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABCadenceManager", "kind": "class", "doc": "

Class to control the GAB Cadence Window.

\n"}, {"fullname": "lakehouse_engine.core.gab_manager.GABCadenceManager.extended_window_calculator", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABCadenceManager.extended_window_calculator", "kind": "function", "doc": "

Calculate the extended window for the given cadence.

\n\n

Calculates the extended window of any cadence, even when the user provides\ncustom dates that are not the exact start and end dates of a cadence.

\n\n
Arguments:
\n\n
    \n
  • cadence: cadence to process
  • \n
  • reconciliation_cadence: reconciliation to process.
  • \n
  • current_date: current date.
  • \n
  • start_date_str: start date of the period to process.
  • \n
  • end_date_str: end date of the period to process.
  • \n
  • query_type: use case query type.
  • \n
  • rerun_flag: flag indicating if it's a rerun or a normal run.
  • \n
  • snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
  • \n
\n", "signature": "(\tself,\tcadence: str,\treconciliation_cadence: str,\tcurrent_date: datetime.datetime,\tstart_date_str: str,\tend_date_str: str,\tquery_type: str,\trerun_flag: str,\tsnapshot_flag: str) -> tuple[datetime.datetime, datetime.datetime, datetime.datetime, datetime.datetime]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_manager.GABCadenceManager.get_cadence_start_end_dates", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABCadenceManager.get_cadence_start_end_dates", "kind": "function", "doc": "

Generate the new set of extended start and end dates based on the cadence.

\n\n

Running the week cadence again to extend to the correct week start and end dates in case\n a recon window for the Week cadence is present.\nFor end_date 2022-12-31, in case a Quarter recon window is present for the Week\n cadence, the start and end dates are recalculated to 2022-10-01 and 2022-12-31.\nBut these are not the start and end dates of a week. Hence, to correct this, the new dates\n are passed again to get the correct dates.

\n\n
Arguments:
\n\n
    \n
  • cadence: cadence to process.
  • \n
  • derived_cadence: cadence reconciliation to process.
  • \n
  • start_date: start date of the period to process.
  • \n
  • end_date: end date of the period to process.
  • \n
  • query_type: use case query type.
  • \n
current_date: current date, used as the end date whenever the computed end date\nis greater than the current date.
  • \n
\n", "signature": "(\tself,\tcadence: str,\tderived_cadence: str,\tstart_date: datetime.datetime,\tend_date: datetime.datetime,\tquery_type: str,\tcurrent_date: datetime.datetime) -> tuple[datetime.datetime, datetime.datetime]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_manager.GABViewManager", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABViewManager", "kind": "class", "doc": "

Class to control the GAB View creation.

\n"}, {"fullname": "lakehouse_engine.core.gab_manager.GABViewManager.__init__", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABViewManager.__init__", "kind": "function", "doc": "

Construct GABViewManager instances.

\n\n
Arguments:
\n\n
    \n
  • query_id: gab configuration table use case identifier.
  • \n
  • lookup_query_builder: gab configuration data.
  • \n
  • target_database: target database to write.
  • \n
  • target_table: target table to write.
  • \n
\n", "signature": "(\tquery_id: str,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\ttarget_database: str,\ttarget_table: str)"}, {"fullname": "lakehouse_engine.core.gab_manager.GABViewManager.generate_use_case_views", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABViewManager.generate_use_case_views", "kind": "function", "doc": "

Generate all the use case views.

\n\n

Generates the DDLs for each of the views. This DDL is dynamically built based on\nthe mappings provided in the config table.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator", "modulename": "lakehouse_engine.core.gab_sql_generator", "kind": "module", "doc": "

Module to define GAB SQL classes.

\n"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABSQLGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABSQLGenerator", "kind": "class", "doc": "

Abstract class defining the behaviour of a GAB SQL Generator.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABSQLGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABSQLGenerator.generate_sql", "kind": "function", "doc": "

Define the generate sql command.

\n\n

E.g., the behaviour of gab generate sql inheriting from this.

\n", "signature": "(self) -> Optional[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABInsertGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABInsertGenerator", "kind": "class", "doc": "

GAB insert generator.

\n\n

Creates the insert statement based on the dimensions and metrics provided in\nthe configuration table.

\n", "bases": "GABSQLGenerator"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABInsertGenerator.__init__", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABInsertGenerator.__init__", "kind": "function", "doc": "

Construct GABInsertGenerator instances.

\n\n
Arguments:
\n\n
    \n
  • query_id: gab configuration table use case identifier.
  • \n
  • cadence: inputted cadence to process.
  • \n
  • final_stage_table: stage view name.
  • \n
  • lookup_query_builder: gab configuration data.
  • \n
  • target_database: target database to write.
  • \n
  • target_table: target table to write.
  • \n
\n", "signature": "(\tquery_id: str,\tcadence: str,\tfinal_stage_table: str,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\ttarget_database: str,\ttarget_table: str)"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABInsertGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABInsertGenerator.generate_sql", "kind": "function", "doc": "

Generate insert sql statement to the insights table.

\n", "signature": "(self) -> Optional[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABViewGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABViewGenerator", "kind": "class", "doc": "

GAB view generator.

\n\n

Creates the use case view statement to be consumed.

\n", "bases": "GABSQLGenerator"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABViewGenerator.__init__", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABViewGenerator.__init__", "kind": "function", "doc": "

Construct GABViewGenerator instances.

\n\n
Arguments:
\n\n
    \n
  • cadence_snapshot_status: each cadence with the corresponding snapshot\nstatus.
  • \n
  • target_database: target database to write.
  • \n
  • view_name: name of the view to be generated.
  • \n
  • final_cols: columns to return in the view.
  • \n
  • target_table: target table to write.
  • \n
  • dimensions_and_metrics_with_alias: configured dimensions and metrics with\nalias to compute in the view.
  • \n
  • dimensions: use case configured dimensions.
  • \n
  • dimensions_and_metrics: use case configured dimensions and metrics.
  • \n
  • final_calculated_script: use case calculated metrics.
  • \n
  • query_id: gab configuration table use case identifier.
  • \n
  • view_filter: filter to add in the view.
  • \n
  • final_calculated_script_snapshot: use case calculated metrics with snapshot.
  • \n
  • without_snapshot_cadences: cadences without snapshot.
  • \n
  • with_snapshot_cadences: cadences with snapshot.
  • \n
\n", "signature": "(\tcadence_snapshot_status: dict,\ttarget_database: str,\tview_name: str,\tfinal_cols: str,\ttarget_table: str,\tdimensions_and_metrics_with_alias: str,\tdimensions: str,\tdimensions_and_metrics: str,\tfinal_calculated_script: str,\tquery_id: str,\tview_filter: str,\tfinal_calculated_script_snapshot: str,\twithout_snapshot_cadences: list[str],\twith_snapshot_cadences: list[str])"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABViewGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABViewGenerator.generate_sql", "kind": "function", "doc": "

Generate use case view sql statement.

\n", "signature": "(*args: Any) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABDeleteGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABDeleteGenerator", "kind": "class", "doc": "

GAB delete generator.

\n\n

Creates the delete statement to clean the use case base data on the insights table.

\n", "bases": "GABSQLGenerator"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABDeleteGenerator.__init__", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABDeleteGenerator.__init__", "kind": "function", "doc": "

Construct GABDeleteGenerator instances.

\n\n
Arguments:
\n\n
    \n
  • query_id: gab configuration table use case identifier.
  • \n
  • cadence: inputted cadence to process.
  • \n
  • temp_stage_view_name: stage view name.
  • \n
  • lookup_query_builder: gab configuration data.
  • \n
  • target_database: target database to write.
  • \n
  • target_table: target table to write.
  • \n
\n", "signature": "(\tquery_id: str,\tcadence: str,\ttemp_stage_view_name: str,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\ttarget_database: str,\ttarget_table: str)"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABDeleteGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABDeleteGenerator.generate_sql", "kind": "function", "doc": "

Generate delete sql statement.

\n\n

This statement is to clean the insights table for the corresponding use case.

\n", "signature": "(*args: Any) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager", "modulename": "lakehouse_engine.core.s3_file_manager", "kind": "module", "doc": "

File manager module using boto3.

\n"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager", "kind": "class", "doc": "

Set of actions to manipulate s3 files in several ways.

\n", "bases": "lakehouse_engine.core.file_manager.FileManager"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.get_function", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.delete_objects", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.delete_objects", "kind": "function", "doc": "

Delete objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.copy_objects", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.copy_objects", "kind": "function", "doc": "

Copies objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be copied based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.move_objects", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.move_objects", "kind": "function", "doc": "

Moves objects and 'directories'.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be moved based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.request_restore", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.request_restore", "kind": "function", "doc": "

Request the restore of archived data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.check_restore_status", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.check_restore_status", "kind": "function", "doc": "

Check the restore status of archived data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.request_restore_to_destination_and_wait", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.request_restore_to_destination_and_wait", "kind": "function", "doc": "

Request and wait for the restore to complete, polling the restore status.

\n\n

After the restore is done, the restored files are copied to the destination.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager", "kind": "class", "doc": "

Set of actions to restore archives.

\n"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager.check_restore_status", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager.check_restore_status", "kind": "function", "doc": "

Check the restore status of archived data.

\n\n
Arguments:
\n\n
    \n
  • source_bucket: name of bucket to check the restore status.
  • \n
  • source_object: object to check the restore status.
  • \n
\n\n
Returns:
\n\n
\n

A dict containing the amount of objects in each status.

\n
\n", "signature": "(source_bucket: str, source_object: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager.request_restore", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager.request_restore", "kind": "function", "doc": "

Request the restore of archived data.

\n\n
Arguments:
\n\n
    \n
  • source_bucket: name of bucket to perform the restore.
  • \n
  • source_object: object to be restored.
  • \n
  • restore_expiration: restore expiration in days.
  • \n
  • retrieval_tier: type of restore, possible values are:\nBulk, Standard or Expedited.
  • \n
  • dry_run: if dry_run is set to True, the function will print a dict with\nall the paths that would be restored based on the given keys.
  • \n
\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager.request_restore_and_wait", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager.request_restore_and_wait", "kind": "function", "doc": "

Request and wait for the restore to complete, polling the restore status.

\n\n
Arguments:
\n\n
    \n
  • source_bucket: name of bucket to perform the restore.
  • \n
  • source_object: object to be restored.
  • \n
  • restore_expiration: restore expiration in days.
  • \n
  • retrieval_tier: type of restore, possible values are:\nBulk, Standard or Expedited.
  • \n
  • dry_run: if dry_run is set to True, the function will print a dict with\nall the paths that would be restored based on the given keys.
  • \n
\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager", "modulename": "lakehouse_engine.core.sensor_manager", "kind": "module", "doc": "

Module to define Sensor Manager classes.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager", "kind": "class", "doc": "

Class to control the Sensor execution.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.check_if_sensor_has_acquired_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.check_if_sensor_has_acquired_data", "kind": "function", "doc": "

Check if sensor has acquired new data.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id.
  • \n
  • control_db_table_name: db.table to control sensor runs.
  • \n
\n\n
Returns:
\n\n
\n

True if the sensor acquired new data, otherwise False.

\n
\n", "signature": "(cls, sensor_id: str, control_db_table_name: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.update_sensor_status", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.update_sensor_status", "kind": "function", "doc": "

Control sensor execution storing the execution data in a delta table.

\n\n
Arguments:
\n\n
    \n
  • sensor_spec: sensor spec containing all sensor\ninformation we need to update the control status.
  • \n
  • status: status of the sensor.
  • \n
  • upstream_key: upstream key (e.g., used to store an attribute\nname from the upstream so that new data can be detected\nautomatically).
  • \n
  • upstream_value: upstream value (e.g., used to store the max\nattribute value from the upstream so that new data can be\ndetected automatically).
  • \n
\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec,\tstatus: str,\tupstream_key: str = None,\tupstream_value: str = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.read_sensor_table_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.read_sensor_table_data", "kind": "function", "doc": "

Read data from delta table containing sensor status info.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id. If this parameter is defined search occurs\nonly considering this parameter. Otherwise, it considers sensor\nassets and checkpoint location.
  • \n
  • control_db_table_name: db.table to control sensor runs.
  • \n
  • assets: list of assets that are fueled by the pipeline\nwhere this sensor is.
  • \n
\n\n
Return:
\n\n
\n

Row containing the data for the provided sensor_id.

\n
\n", "signature": "(\tcls,\tcontrol_db_table_name: str,\tsensor_id: str = None,\tassets: list = None) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager", "kind": "class", "doc": "

Class to deal with Sensor Upstream data.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_filter_exp_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_filter_exp_query", "kind": "function", "doc": "

Generates a sensor preprocess query based on timestamp logic.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id.
  • \n
  • filter_exp: expression to filter incoming new data.\nYou can use the placeholder ?upstream_value so that\nit can be replaced by the upstream_value in the\ncontrol_db_table_name for this specific sensor_id.
  • \n
  • control_db_table_name: db.table to retrieve the last status change\ntimestamp. This is only relevant for the jdbc sensor.
  • \n
  • upstream_key: the key of custom sensor information\nto control how to identify new data from the\nupstream (e.g., a time column in the upstream).
  • \n
  • upstream_value: value for custom sensor\nto identify new data from the upstream\n(e.g., the value of a time present in the upstream)\nIf none we will set the default value.\nNote: This parameter is used just to override the\ndefault value -2147483647.
  • \n
  • upstream_table_name: value for custom sensor\nto query new data from the upstream.\nIf none we will set the default value,\nour sensor_new_data view.
  • \n
\n\n
Return:
\n\n
\n

The query string.

\n
\n", "signature": "(\tcls,\tsensor_id: str,\tfilter_exp: str,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_table_preprocess_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_table_preprocess_query", "kind": "function", "doc": "

Generates a query to be used for a sensor having other sensor as upstream.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id.
  • \n
\n\n
Return:
\n\n
\n

The query string.

\n
\n", "signature": "(cls, sensor_id: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.read_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.read_new_data", "kind": "function", "doc": "

Read new data from the upstream into the sensor 'new_data_df'.

\n\n
Arguments:
\n\n
    \n
  • sensor_spec: sensor spec containing all sensor information.
  • \n
\n\n
Return:
\n\n
\n

An empty dataframe if there is no new data, otherwise a dataframe with the new data.

\n
\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.get_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.get_new_data", "kind": "function", "doc": "

Get new data from upstream df if it's present.

\n\n
Arguments:
\n\n
    \n
  • new_data_df: DataFrame possibly containing new data.
  • \n
\n\n
Return:
\n\n
\n

Optional row, present if there is new data in the upstream,\n absent otherwise.

\n
\n", "signature": "(\tcls,\tnew_data_df: pyspark.sql.dataframe.DataFrame) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_sap_logchain_query", "kind": "function", "doc": "

Generates a sensor query based on the SAP Logchain table.

\n\n
Arguments:
\n\n
    \n
  • chain_id: chain id to query the status on SAP.
  • \n
  • dbtable: db.table to retrieve the data to\ncheck if the sap chain is already finished.
  • \n
  • status: status value to look for in the SAP chain records (defaults to 'G').
  • \n
  • engine_table_name: table name exposed with the SAP LOGCHAIN data.\nThis table will be used in the jdbc query.
  • \n
\n\n
Return:
\n\n
\n

The query string.

\n
\n", "signature": "(\tcls,\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager", "modulename": "lakehouse_engine.core.table_manager", "kind": "module", "doc": "

Table manager module.

\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager", "kind": "class", "doc": "

Set of actions to manipulate tables/views in several ways.

\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.__init__", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.__init__", "kind": "function", "doc": "

Construct TableManager algorithm instances.

\n\n
Arguments:
\n\n
    \n
  • configs: configurations for the TableManager algorithm.
  • \n
\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_function", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create", "kind": "function", "doc": "

Create a new table or view on metastore.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create_many", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create_many", "kind": "function", "doc": "

Create multiple tables or views on metastore.

\n\n

In this function, the paths to the DDL files can be separated by commas.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.compute_table_statistics", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.compute_table_statistics", "kind": "function", "doc": "

Compute table statistics.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_table", "kind": "function", "doc": "

Deletes the table from the metastore and erases all its data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_view", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_view", "kind": "function", "doc": "

Deletes the view from the metastore and erases all its data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.truncate", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.truncate", "kind": "function", "doc": "

Truncate function erases all data but keeps metadata.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.vacuum", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.vacuum", "kind": "function", "doc": "

Vacuum function erases older versions from Delta Lake tables or locations.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.describe", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.describe", "kind": "function", "doc": "

Describe function describes metadata from some table or view.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.optimize", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.optimize", "kind": "function", "doc": "

Optimize function optimizes the layout of Delta Lake data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_multiple_sql_files", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_multiple_sql_files", "kind": "function", "doc": "

Execute multiple statements in multiple sql files.

\n\n

In this function, the paths to the files are separated by commas.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_sql", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_sql", "kind": "function", "doc": "

Execute sql commands separated by semicolon (;).

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.show_tbl_properties", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.show_tbl_properties", "kind": "function", "doc": "

Show Table Properties.

\n\n
Returns:
\n\n
\n

A dataframe with the table properties.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_tbl_pk", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_tbl_pk", "kind": "function", "doc": "

Get the primary key of a particular table.

\n\n
Returns:
\n\n
\n

The list of columns that are part of the primary key.

\n
\n", "signature": "(self) -> List[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.repair_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.repair_table", "kind": "function", "doc": "

Run the repair table command.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.delete_where", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.delete_where", "kind": "function", "doc": "

Run the delete where command.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors", "modulename": "lakehouse_engine.dq_processors", "kind": "module", "doc": "

Package to define data quality processes available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations", "modulename": "lakehouse_engine.dq_processors.custom_expectations", "kind": "module", "doc": "

Package containing custom DQ expectations available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "kind": "module", "doc": "

Expectation to check if column 'a' is less than or equal to column 'b'.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ColumnPairCustom", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ColumnPairCustom", "kind": "class", "doc": "

Asserts that column 'A' is less than or equal to column 'B'.

\n\n

Additionally, the 'margin' parameter can be used to add a margin to the\ncheck between column 'A' and 'B': 'A' <= 'B' + 'margin'.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_pair_map_metric_provider.ColumnPairMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ExpectColumnPairAToBeSmallerOrEqualThanB", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ExpectColumnPairAToBeSmallerOrEqualThanB", "kind": "class", "doc": "

Expect values in column A to be less than or equal to values in column B.

\n\n
Arguments:
\n\n
    \n
  • column_A: The first column name.
  • \n
  • column_B: The second column name.
  • \n
  • margin: additional approximation to column B value.
  • \n
\n\n
Keyword Args:
\n\n
\n
    \n
  • allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
  • \n
  • ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).
  • \n
  • result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.
  • \n
  • include_config: If True (default), then include the expectation config\n as part of the result object.
  • \n
  • catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.
  • \n
  • meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.
  • \n
\n
\n\n
Returns:
\n\n
\n

An ExpectationSuiteValidationResult.

\n
\n", "bases": "great_expectations.expectations.expectation.ColumnPairMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "kind": "module", "doc": "

Expectation to check if column value is a date within a timeframe.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ColumnValuesDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ColumnValuesDateNotOlderThan", "kind": "class", "doc": "

Asserts that column values are a date that isn't older than a given date.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_map_metric_provider.ColumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ExpectColumnValuesToBeDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ExpectColumnValuesToBeDateNotOlderThan", "kind": "class", "doc": "

Expect the value in a column to be a date that is not older than a given time.

\n\n

Since timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.

\n\n
Arguments:
\n\n
    \n
  • column: Name of column to validate
  • \n
  • Note: Column must be of type Date, Timestamp or String (with Timestamp format).\nFormat: yyyy-MM-ddTHH:mm:ss
  • \n
  • timeframe: dict with the definition of the timeframe.
  • \n
  • kwargs: dict with additional parameters.
  • \n
\n\n
Keyword Args:
\n\n
\n
    \n
  • allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
  • \n
  • ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).
  • \n
  • result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.
  • \n
  • include_config: If True (default), then include the expectation config\n as part of the result object.
  • \n
  • catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.
  • \n
  • meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.
  • \n
\n
\n\n
Returns:
\n\n
\n

An ExpectationSuiteValidationResult.

\n
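A minimal sketch of how this expectation might be configured in a dq_functions entry; the column name is hypothetical and the timeframe keys are assumed to mirror timedelta keyword arguments (e.g. days, weeks):

    dq_function = {
        "function": "expect_column_values_to_be_date_not_older_than",
        "args": {"column": "created_on", "timeframe": {"days": 30}},
    }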
\n", "bases": "great_expectations.expectations.expectation.ColumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "kind": "module", "doc": "

Expectation to check if column 'a' equals 'b', or 'c'.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.MulticolumnCustomMetric", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "MulticolumnCustomMetric", "kind": "class", "doc": "

Expectation metric definition.

\n\n

This expectation asserts that column 'a' must be equal to column 'b' or column 'c'.\nIn addition, it is possible to validate that column 'b' or 'c' matches a regex.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.multicolumn_map_metric_provider.MulticolumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.ExpectMulticolumnColumnAMustEqualBOrC", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "ExpectMulticolumnColumnAMustEqualBOrC", "kind": "class", "doc": "

Expect column 'a' to be equal to column 'b' when 'b' is not empty; otherwise 'a' must be equal to 'c'.

\n\n
Arguments:
\n\n
    \n
  • column_list: The column names to evaluate.
  • \n
\n\n
Keyword Args:
\n\n
\n
    \n
  • ignore_row_if: defaults to \"never\".
  • \n
  • result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC, COMPLETE, or SUMMARY.\n Default set to BASIC.
  • \n
  • include_config: If True, then include the expectation\n config as part of the result object.\n Default set to True.
  • \n
  • catch_exceptions: If True, then catch exceptions\n and include them as part of the result object.\n Default set to False.
  • \n
\n
\n\n
Returns:
\n\n
\n

An ExpectationSuiteValidationResult.

\n
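A minimal sketch of a dq_functions entry for this expectation, based on the documented column_list argument (the column names are illustrative):

    dq_function = {
        "function": "expect_multicolumn_column_a_must_equal_b_or_c",
        "args": {"column_list": ["invoice_currency", "document_currency", "local_currency"]},
    }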
\n", "bases": "great_expectations.expectations.expectation.MulticolumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "kind": "module", "doc": "

Expectation to check if an aggregated column satisfies the condition.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe", "kind": "class", "doc": "

Expect the aggregation of a column to satisfy the specified condition.

\n\n
Arguments:
\n\n
    \n
  • template_dict: dict with the following keys:\n
      \n
    • column (column whose aggregated value is checked).
    • \n
    • group_column_list (group by column names to be listed).
    • \n
    • condition (how to validate the aggregated value, e.g.: between,\ngreater, lesser).
    • \n
    • max_value (maximum allowed value).
    • \n
    • min_value (minimum allowed value).
    • \n
    • agg_type (sum/count/max/min).
    • \n
  • \n
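A minimal sketch of the template_dict based on the keys listed above (column, grouping, and threshold values are illustrative):

    dq_function = {
        "function": "expect_queried_column_agg_value_to_be",
        "args": {
            "template_dict": {
                "column": "amount",
                "group_column_list": ["store_id"],
                "condition": "between",
                "min_value": 0,
                "max_value": 1000000,
                "agg_type": "sum",
            }
        },
    }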
\n", "bases": "great_expectations.expectations.expectation.QueryExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe.validate_configuration", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe.validate_configuration", "kind": "function", "doc": "

Validates that a configuration has been set.

\n\n
Arguments:
\n\n
    \n
  • configuration (Optional[ExpectationConfiguration]): an optional Expectation Configuration entry.
  • \n
\n\n
Returns:
\n\n
\n

None. Raises InvalidExpectationConfigurationError if the configuration is invalid.

\n
\n", "signature": "(\tself,\tconfiguration: Optional[great_expectations.core.expectation_configuration.ExpectationConfiguration] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "kind": "module", "doc": "

Module containing the class definition of the Data Quality Factory.

\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory", "kind": "class", "doc": "

Class for the Data Quality Factory.

\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.run_dq_process", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.run_dq_process", "kind": "function", "doc": "

Run the specified data quality process on a dataframe.

\n\n

Based on the dq_spec, we apply the defined expectations on top of the dataframe\nin order to run the necessary validations and then output the result of\nthe data quality process.

\n\n
Arguments:
\n\n
    \n
  • dq_spec: data quality specification.
  • \n
  • data: input dataframe to run the dq process on.
  • \n
\n\n
Returns:
\n\n
\n

The DataFrame containing the results of the DQ process.

\n
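An illustrative call, assuming DQSpec/DQFunctionSpec field names such as spec_id, input_id, dq_type and dq_functions (the identifiers, the expectation, and the sample data are hypothetical):

    from pyspark.sql import SparkSession

    from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec
    from lakehouse_engine.dq_processors.dq_factory import DQFactory

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1,), (2,)], ["order_id"])  # sample input data

    dq_spec = DQSpec(
        spec_id="sales_dq",    # hypothetical identifier
        input_id="sales",      # hypothetical identifier
        dq_type="validator",
        dq_functions=[
            DQFunctionSpec(
                function="expect_column_values_to_not_be_null",
                args={"column": "order_id"},
            )
        ],
    )
    validated_df = DQFactory.run_dq_process(dq_spec=dq_spec, data=df)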
\n", "signature": "(\tcls,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.build_data_docs", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.build_data_docs", "kind": "function", "doc": "

Build Data Docs for the project.

\n\n

This function does a full build of the data docs based on all the Great Expectations\ncheckpoints in the specified location, retrieving the full history of runs/validations\nexecuted and their results.

\n\n
Arguments:
\n\n
    \n
  • store_backend: which store_backend to use (e.g. s3 or file_system).
  • \n
  • local_fs_root_dir: path of the root directory. Note: only applicable\nfor store_backend file_system
  • \n
  • data_docs_local_fs: path where to store the data docs. Note: only applicable\nfor store_backend file_system.
  • \n
  • data_docs_prefix: prefix where to store data_docs' data.
  • \n
  • bucket: the bucket name to consider for the store_backend\n(store DQ artefacts). Note: only applicable for store_backend s3.
  • \n
  • data_docs_bucket: the bucket name for data docs only. When defined,\nit will supersede bucket parameter.\nNote: only applicable for store_backend s3.
  • \n
  • expectations_store_prefix: prefix where to store expectations' data.\nNote: only applicable for store_backend s3.
  • \n
  • validations_store_prefix: prefix where to store validations' data.\nNote: only applicable for store_backend s3.
  • \n
  • checkpoint_store_prefix: prefix where to store checkpoints' data.\nNote: only applicable for store_backend s3.
  • \n
\n", "signature": "(\tcls,\tstore_backend: str = 's3',\tlocal_fs_root_dir: str = None,\tdata_docs_local_fs: str = None,\tdata_docs_prefix: str = 'dq/data_docs/site/',\tbucket: str = None,\tdata_docs_bucket: str = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/') -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.exceptions", "modulename": "lakehouse_engine.dq_processors.exceptions", "kind": "module", "doc": "

Package defining all the DQ custom exceptions.

\n"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQValidationsFailedException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQValidationsFailedException", "kind": "class", "doc": "

Exception for when the data quality validations fail.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQCheckpointsResultsException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQCheckpointsResultsException", "kind": "class", "doc": "

Exception for when the checkpoint results parsing fails.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.validator", "modulename": "lakehouse_engine.dq_processors.validator", "kind": "module", "doc": "

Module containing the definition of a data quality validator.

\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator", "kind": "class", "doc": "

Class containing the data quality validator.

\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.get_dq_validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.get_dq_validator", "kind": "function", "doc": "

Get a validator according to the specification.

\n\n

We use getattr to dynamically execute any available expectation:\ncalling getattr(validator, function) is similar to calling validator.function(). With this\napproach, we can execute any supported expectation (a standalone sketch of this pattern\nis shown below).

\n\n
Arguments:
\n\n
    \n
  • context: the BaseDataContext containing the configurations for the data\nsource and store backend.
  • \n
  • batch_request: run time batch request to be able to query underlying data.
  • \n
  • expectation_suite_name: name of the expectation suite.
  • \n
  • dq_functions: a list of DQFunctionSpec to consider in the expectation suite.
  • \n
  • critical_functions: list of critical expectations in the expectation suite.
  • \n
\n\n
Returns:
\n\n
\n

The validator with the expectation suite stored.

\n
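A standalone sketch of the getattr-based dispatch described above (not the engine's concrete code); it assumes each DQFunctionSpec exposes function and args attributes:

    def apply_expectations(validator, dq_functions):
        """Call each expectation on the validator by name."""
        for spec in dq_functions:
            expectation = getattr(validator, spec.function)  # same as validator.<function>
            expectation(**(spec.args or {}))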
\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tdq_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec],\tcritical_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec]) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.tag_source_with_dq", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.tag_source_with_dq", "kind": "function", "doc": "

Tags the source dataframe with a new column having the DQ results.

\n\n
Arguments:
\n\n
    \n
  • source_pk: the primary key of the source data.
  • \n
  • source_df: the source dataframe to be tagged with DQ results.
  • \n
  • results_df: dq results dataframe.
  • \n
\n\n

Returns: a dataframe tagged with the DQ results.

\n", "signature": "(\tcls,\tsource_pk: List[str],\tsource_df: pyspark.sql.dataframe.DataFrame,\tresults_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine", "modulename": "lakehouse_engine.engine", "kind": "module", "doc": "

Contract of the lakehouse engine with all the available functions to be executed.

\n"}, {"fullname": "lakehouse_engine.engine.load_data", "modulename": "lakehouse_engine.engine", "qualname": "load_data", "kind": "function", "doc": "

Load data using the DataLoader algorithm.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks or other\napps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
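An illustrative invocation with an inline acon; the spec keys follow the usual input/output spec structure, but the exact keys, formats, and locations are assumptions to be adapted to your environment:

    from lakehouse_engine.engine import load_data

    acon = {
        "input_specs": [
            {
                "spec_id": "sales_source",
                "read_type": "batch",
                "data_format": "csv",
                "location": "s3://my-bucket/sales/",  # hypothetical path
            }
        ],
        "output_specs": [
            {
                "spec_id": "sales_bronze",
                "input_id": "sales_source",
                "write_type": "append",
                "data_format": "delta",
                "location": "s3://my-bucket/bronze/sales/",  # hypothetical path
            }
        ],
    }

    load_data(acon=acon)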
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_reconciliation", "modulename": "lakehouse_engine.engine", "qualname": "execute_reconciliation", "kind": "function", "doc": "

Execute the Reconciliator algorithm.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks or other\napps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_dq_validation", "modulename": "lakehouse_engine.engine", "qualname": "execute_dq_validation", "kind": "function", "doc": "

Execute the DQValidator algorithm.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks or other\napps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_table", "modulename": "lakehouse_engine.engine", "qualname": "manage_table", "kind": "function", "doc": "

Manipulate tables/views using Table Manager algorithm.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks\nor other apps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_files", "modulename": "lakehouse_engine.engine", "qualname": "manage_files", "kind": "function", "doc": "

Manipulate s3 files using File Manager algorithm.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks\nor other apps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_sensor", "modulename": "lakehouse_engine.engine", "qualname": "execute_sensor", "kind": "function", "doc": "

Execute a sensor based on a Sensor Algorithm Configuration.

\n\n

A sensor is useful to check if an upstream system has new data.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks\nor other apps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
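A minimal usage sketch relying only on the documented acon_path parameter and the boolean return value (the path is hypothetical):

    from lakehouse_engine.engine import execute_sensor

    if execute_sensor(acon_path="s3://my-bucket/acons/sensor_sales.json"):
        print("Upstream has new data; trigger the downstream load.")
    else:
        print("No new data detected.")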
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.update_sensor_status", "modulename": "lakehouse_engine.engine", "qualname": "update_sensor_status", "kind": "function", "doc": "

Update internal sensor status.

\n\n

Update the sensor status in the control table.\nIt should be used to tell the system\nthat the sensor has processed all the new data that was previously identified,\nhence updating the shifted sensor status.\nIt is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA,\nbut there might be scenarios - still to be identified -\nwhere we can update the sensor status from/to different statuses.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id.
  • \n
  • control_db_table_name: db.table to store sensor checkpoints.
  • \n
  • status: status of the sensor.
  • \n
  • assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
  • \n
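A minimal sketch using only the documented parameters (the sensor id, control table, and asset names are hypothetical):

    from lakehouse_engine.engine import update_sensor_status

    update_sensor_status(
        sensor_id="sales_upstream_sensor",
        control_db_table_name="my_db.sensor_control",
        status="PROCESSED_NEW_DATA",
        assets=["sales"],
    )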
\n", "signature": "(\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_query", "kind": "function", "doc": "

Generates a preprocess query to be used in a sensor configuration.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id.
  • \n
  • filter_exp: expression to filter incoming new data.\nYou can use the placeholders ?default_upstream_key and\n?default_upstream_value, so that they can be replaced by the\nrespective values in the control_db_table_name for this specific\nsensor_id.
  • \n
  • control_db_table_name: db.table to retrieve the last status change\ntimestamp. This is only relevant for the jdbc sensor.
  • \n
  • upstream_key: the key of custom sensor information to control how to\nidentify new data from the upstream (e.g., a time column in the\nupstream).
  • \n
  • upstream_value: the upstream value\nto identify new data from the upstream (e.g., the value of a time\npresent in the upstream).
  • \n
  • upstream_table_name: value for the custom sensor\nto query new data from the upstream.\nIf None, we will set the default value,\nour sensor_new_data view.
  • \n
\n\n
Return:
\n\n
\n

The query string.

\n
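A minimal sketch using the documented placeholders (the sensor id, table, and column names are hypothetical):

    from lakehouse_engine.engine import generate_sensor_query

    query = generate_sensor_query(
        sensor_id="sales_upstream_sensor",
        filter_exp="?default_upstream_key > '?default_upstream_value'",
        control_db_table_name="my_db.sensor_control",
        upstream_key="load_timestamp",
    )
    print(query)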
\n", "signature": "(\tsensor_id: str,\tfilter_exp: str = None,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_sap_logchain_query", "kind": "function", "doc": "

Generates a sensor query based on the SAP Logchain table.

\n\n
Arguments:
\n\n
    \n
  • chain_id: chain id to query the status on SAP.
  • \n
  • dbtable: db.table to retrieve the data to\ncheck if the SAP chain is already finished.
  • \n
  • status: status value indicating that the SAP chain has finished successfully.
  • \n
  • engine_table_name: table name exposed with the SAP LOGCHAIN data.\nThis table will be used in the jdbc query.
  • \n
\n\n
Return:
\n\n
\n

The query string.

\n
\n", "signature": "(\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.send_notification", "modulename": "lakehouse_engine.engine", "qualname": "send_notification", "kind": "function", "doc": "

Send a notification using a notifier.

\n\n
Arguments:
\n\n
    \n
  • args: arguments for the notifier.
  • \n
\n", "signature": "(args: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.build_data_docs", "modulename": "lakehouse_engine.engine", "qualname": "build_data_docs", "kind": "function", "doc": "

Build Data Docs for the project.

\n\n

This function does a full build of the data docs based on all the Great Expectations\ncheckpoints in the specified location, retrieving the full history of runs/validations\nexecuted and their results.

\n\n
Arguments:
\n\n
    \n
  • store_backend: which store_backend to use (e.g. s3 or file_system).
  • \n
  • local_fs_root_dir: path of the root directory. Note: only applicable\nfor store_backend file_system
  • \n
  • data_docs_local_fs: path where to store the data docs. Note: only applicable\nfor store_backend file_system.
  • \n
  • data_docs_prefix: prefix where to store data_docs' data.
  • \n
  • bucket: the bucket name to consider for the store_backend\n(store DQ artefacts). Note: only applicable for store_backend s3.
  • \n
  • data_docs_bucket: the bucket name for data docs only. When defined,\nit will supersede bucket parameter.\nNote: only applicable for store_backend s3.
  • \n
  • expectations_store_prefix: prefix where to store expectations' data.\nNote: only applicable for store_backend s3.
  • \n
  • validations_store_prefix: prefix where to store validations' data.\nNote: only applicable for store_backend s3.
  • \n
  • checkpoint_store_prefix: prefix where to store checkpoints' data.\nNote: only applicable for store_backend s3.
  • \n
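Two illustrative calls, one per store_backend, using only the documented parameters (bucket names and local paths are hypothetical):

    from lakehouse_engine.engine import build_data_docs

    # s3 backend with a dedicated data docs bucket.
    build_data_docs(store_backend="s3", bucket="my-dq-bucket", data_docs_bucket="my-docs-bucket")

    # local file system backend.
    build_data_docs(
        store_backend="file_system",
        local_fs_root_dir="/dbfs/dq/",
        data_docs_local_fs="/dbfs/dq/data_docs/",
    )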
\n", "signature": "(\tstore_backend: str = 's3',\tlocal_fs_root_dir: str = None,\tdata_docs_local_fs: str = None,\tdata_docs_prefix: str = 'dq/data_docs/site/',\tbucket: str = None,\tdata_docs_bucket: str = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/') -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_gab", "modulename": "lakehouse_engine.engine", "qualname": "execute_gab", "kind": "function", "doc": "

Execute the gold asset builder based on a GAB Algorithm Configuration.

\n\n

GAB is useful to build your gold assets with predefined functions for recurring\nperiods.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks\nor other apps).
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when collecting\nthe engine usage.
  • \n
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io", "modulename": "lakehouse_engine.io", "kind": "module", "doc": "

Input and Output package responsible for the behaviour of reading and writing.

\n"}, {"fullname": "lakehouse_engine.io.exceptions", "modulename": "lakehouse_engine.io.exceptions", "kind": "module", "doc": "

Package defining all the io custom exceptions.

\n"}, {"fullname": "lakehouse_engine.io.exceptions.IncrementalFilterInputNotFoundException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "IncrementalFilterInputNotFoundException", "kind": "class", "doc": "

Exception for when the input of an incremental filter is not found.

\n\n

This may occur when tables are being loaded incrementally, taking the increment\ndefinition out of a specific table, but that table does not exist yet, most likely\nbecause it has not been loaded for the first time.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.WrongIOFormatException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "WrongIOFormatException", "kind": "class", "doc": "

Exception for when a user provides a wrong I/O format.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.NotSupportedException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "NotSupportedException", "kind": "class", "doc": "

Exception for when a user provides an unsupported operation.

\n", "bases": "builtins.RuntimeError"}, {"fullname": "lakehouse_engine.io.reader", "modulename": "lakehouse_engine.io.reader", "kind": "module", "doc": "

Defines abstract reader behaviour.

\n"}, {"fullname": "lakehouse_engine.io.reader.Reader", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader", "kind": "class", "doc": "

Abstract Reader class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader.Reader.__init__", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.__init__", "kind": "function", "doc": "

Construct Reader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification for reading data.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.reader.Reader.read", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.read", "kind": "function", "doc": "

Abstract read method.

\n\n
Returns:
\n\n
\n

A dataframe read according to the input specification.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.reader_factory", "modulename": "lakehouse_engine.io.reader_factory", "kind": "module", "doc": "

Module for reader factory.

\n"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory", "kind": "class", "doc": "

Class for reader factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory.get_data", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory.get_data", "kind": "function", "doc": "

Get data according to the input specification following a factory pattern.

\n\n
Arguments:
\n\n
    \n
  • spec: input specification to get the data.
  • \n
\n\n
Returns:
\n\n
\n

A dataframe containing the data.

\n
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.InputSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers", "modulename": "lakehouse_engine.io.readers", "kind": "module", "doc": "

Readers package to define reading behaviour.

\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "kind": "module", "doc": "

Module to define behaviour to read from dataframes.

\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader", "kind": "class", "doc": "

Class to read data from a dataframe.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.__init__", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.__init__", "kind": "function", "doc": "

Construct DataFrameReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.read", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.read", "kind": "function", "doc": "

Read data from a dataframe.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from a previously computed dataframe.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.file_reader", "modulename": "lakehouse_engine.io.readers.file_reader", "kind": "module", "doc": "

Module to define behaviour to read from files.

\n"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader", "kind": "class", "doc": "

Class to read from files.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.__init__", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.__init__", "kind": "function", "doc": "

Construct FileReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.read", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.read", "kind": "function", "doc": "

Read file data.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from the files.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "kind": "module", "doc": "

Module to define behaviour to read from JDBC sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader", "kind": "class", "doc": "

Class to read from JDBC source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.__init__", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.__init__", "kind": "function", "doc": "

Construct JDBCReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.read", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.read", "kind": "function", "doc": "

Read data from JDBC source.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from the JDBC source.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "kind": "module", "doc": "

Module to define behaviour to read from Kafka.

\n"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader", "kind": "class", "doc": "

Class to read from Kafka.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.__init__", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.__init__", "kind": "function", "doc": "

Construct KafkaReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.read", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.read", "kind": "function", "doc": "

Read Kafka data.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from Kafka.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.query_reader", "modulename": "lakehouse_engine.io.readers.query_reader", "kind": "module", "doc": "

Module to define behaviour to read from a query.

\n"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader", "kind": "class", "doc": "

Class to read data from a query.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.__init__", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.__init__", "kind": "function", "doc": "

Construct QueryReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.read", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.read", "kind": "function", "doc": "

Read data from a query.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from the query.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "kind": "module", "doc": "

Module to define behaviour to read from SAP B4 sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader", "kind": "class", "doc": "

Class to read from SAP B4 source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.__init__", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.__init__", "kind": "function", "doc": "

Construct SAPB4Reader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.read", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.read", "kind": "function", "doc": "

Read data from SAP B4 source.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from the SAP B4 source.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "kind": "module", "doc": "

Module to define behaviour to read from SAP BW sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader", "kind": "class", "doc": "

Class to read from SAP BW source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.__init__", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.__init__", "kind": "function", "doc": "

Construct SAPBWReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.read", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.read", "kind": "function", "doc": "

Read data from SAP BW source.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from the SAP BW source.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "kind": "module", "doc": "

Module to define behaviour to read from SFTP.

\n"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader", "kind": "class", "doc": "

Class to read from SFTP.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.__init__", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.__init__", "kind": "function", "doc": "

Construct SFTPReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.read", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.read", "kind": "function", "doc": "

Read SFTP data.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from SFTP.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.table_reader", "modulename": "lakehouse_engine.io.readers.table_reader", "kind": "module", "doc": "

Module to define behaviour to read from tables.

\n"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader", "kind": "class", "doc": "

Class to read data from a table.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.__init__", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.__init__", "kind": "function", "doc": "

Construct TableReader instances.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.read", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.read", "kind": "function", "doc": "

Read data from a table.

\n\n
Returns:
\n\n
\n

A dataframe containing the data from the table.

\n
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer", "modulename": "lakehouse_engine.io.writer", "kind": "module", "doc": "

Defines abstract writer behaviour.

\n"}, {"fullname": "lakehouse_engine.io.writer.Writer", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer", "kind": "class", "doc": "

Abstract Writer class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer.Writer.__init__", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.__init__", "kind": "function", "doc": "

Construct Writer instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification to write data.
  • \n
  • df: dataframe to write.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict = None)"}, {"fullname": "lakehouse_engine.io.writer.Writer.write", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write", "kind": "function", "doc": "

Abstract write method.

\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.write_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write_transformed_micro_batch", "kind": "function", "doc": "

Define how to write a streaming micro batch after transforming it.

\n\n

This function must define an inner function that manipulates a streaming batch\nand then return that function. Look at concrete implementations of this\nfunction for more clarity; a generic sketch of the pattern is also shown below.

\n\n
Arguments:
\n\n
    \n
  • kwargs: any keyword arguments.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the foreachBatch spark write method.

\n
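A generic sketch of the inner-function pattern described above (not the engine's concrete implementation); the output_spec content and the delta write are assumptions for illustration:

    def write_transformed_micro_batch(**kwargs):
        output_spec = kwargs["output_spec"]  # assumed to carry the target location

        def inner(batch_df, batch_id):
            # Transform the micro batch here, then write it to the target.
            batch_df.write.format("delta").mode("append").save(output_spec.location)

        return inner

    # The returned inner function has the (batch_df, batch_id) signature expected by
    # Spark's foreachBatch.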
\n", "signature": "(**kwargs: Any) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_transformed_micro_batch", "kind": "function", "doc": "

Get the result of the transformations applied to a micro batch dataframe.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification associated with the writer.
  • \n
  • batch_df: batch dataframe (given from streaming foreachBatch).
  • \n
  • batch_id: id of the batch (given from streaming foreachBatch).
  • \n
  • data: list of all dfs generated on previous steps before writer\nto be available on micro batch transforms.
  • \n
\n\n
Returns:
\n\n
\n

The transformed dataframe.

\n
\n", "signature": "(\tcls,\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tbatch_df: pyspark.sql.dataframe.DataFrame,\tbatch_id: int,\tdata: OrderedDict) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_streaming_trigger", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_streaming_trigger", "kind": "function", "doc": "

Define which streaming trigger will be used.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification.
  • \n
\n\n
Returns:
\n\n
\n

A dict containing streaming trigger.

\n
\n", "signature": "(cls, output_spec: lakehouse_engine.core.definitions.OutputSpec) -> Dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.run_micro_batch_dq_process", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.run_micro_batch_dq_process", "kind": "function", "doc": "

Run the data quality process in a streaming micro batch dataframe.

\n\n

Iterates over the specs and performs the checks or analysis depending on the\ndata quality specification provided in the configuration.

\n\n
Arguments:
\n\n
    \n
  • df: the dataframe to run the dq process on.
  • \n
  • dq_spec: data quality specification.
  • \n
\n\n

Returns: the validated dataframe.

\n", "signature": "(\tdf: pyspark.sql.dataframe.DataFrame,\tdq_spec: List[lakehouse_engine.core.definitions.DQSpec]) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer_factory", "modulename": "lakehouse_engine.io.writer_factory", "kind": "module", "doc": "

Module for writer factory.

\n"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory", "kind": "class", "doc": "

Class for writer factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory.get_writer", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory.get_writer", "kind": "function", "doc": "

Get a writer according to the output specification using a factory pattern.

\n\n
Arguments:
\n\n
    \n
  • spec: output specification to write data.
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n\n
Returns:
\n\n
\n

Writer: writer that will write the data.

\n
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict) -> lakehouse_engine.io.writer.Writer:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers", "modulename": "lakehouse_engine.io.writers", "kind": "module", "doc": "

Package containing the writers responsible for writing data.

\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer", "modulename": "lakehouse_engine.io.writers.console_writer", "kind": "module", "doc": "

Module to define behaviour to write to console.

\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter", "kind": "class", "doc": "

Class to write data to console.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.__init__", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.__init__", "kind": "function", "doc": "

Construct ConsoleWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.write", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.write", "kind": "function", "doc": "

Write data to console.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "kind": "module", "doc": "

Module to define behaviour to write to dataframe.

\n"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter", "kind": "class", "doc": "

Class to write data to dataframe.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.__init__", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.__init__", "kind": "function", "doc": "

Construct DataFrameWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification.
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.write", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.write", "kind": "function", "doc": "

Write data to dataframe.

\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "kind": "module", "doc": "

Module to define the behaviour of delta merges.

\n"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter", "kind": "class", "doc": "

Class to merge data using delta lake.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.__init__", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.__init__", "kind": "function", "doc": "

Construct DeltaMergeWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification containing merge options and\nrelevant information.
  • \n
  • df: the dataframe containing the new data to be merged.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.write", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.write", "kind": "function", "doc": "

Merge new data with current data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.file_writer", "modulename": "lakehouse_engine.io.writers.file_writer", "kind": "module", "doc": "

Module to define behaviour to write to files.

\n"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter", "kind": "class", "doc": "

Class to write data to files.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.__init__", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.__init__", "kind": "function", "doc": "

Construct FileWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.write", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.write", "kind": "function", "doc": "

Write data to files.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to JDBC targets.

\n"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter", "kind": "class", "doc": "

Class to write to JDBC targets.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.__init__", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.__init__", "kind": "function", "doc": "

Construct JDBCWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification.
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.write", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.write", "kind": "function", "doc": "

Write data into JDBC target.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer", "modulename": "lakehouse_engine.io.writers.kafka_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to Kafka.

\n"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter", "kind": "class", "doc": "

Class to write to a Kafka target.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.__init__", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.__init__", "kind": "function", "doc": "

Construct KafkaWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification.
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.write", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.write", "kind": "function", "doc": "

Write data to Kafka.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.table_writer", "modulename": "lakehouse_engine.io.writers.table_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to tables.

\n"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter", "kind": "class", "doc": "

Class to write to a table.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.__init__", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.__init__", "kind": "function", "doc": "

Construct TableWriter instances.

\n\n
Arguments:
\n\n
    \n
  • output_spec: output specification.
  • \n
  • df: dataframe to be written.
  • \n
  • data: list of all dfs generated on previous steps before writer.
  • \n
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.write", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.write", "kind": "function", "doc": "

Write data to a table.

\n\n

After the write operation we repair the table (e.g., update partitions).\nHowever, there's a caveat: this repair operation is not reachable if we are\nrunning in long-running streaming mode. Therefore, we recommend not using the\nTableWriter with formats other than delta lake for those scenarios (as delta lake\ndoes not need msck repair). So, you can: 1) use the delta lake format for the table;\nor 2) use the FileWriter and run the repair with a certain frequency in a separate\ntask of your pipeline.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators", "modulename": "lakehouse_engine.terminators", "kind": "module", "doc": "

Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor", "modulename": "lakehouse_engine.terminators.cdf_processor", "kind": "module", "doc": "

Defines change data feed processor behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor", "kind": "class", "doc": "

Change data feed processor class.

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.expose_cdf", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.expose_cdf", "kind": "function", "doc": "

Expose CDF to external location.

\n\n
Arguments:
\n\n
    \n
  • spec: terminator specification.
  • \n
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.delete_old_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.delete_old_data", "kind": "function", "doc": "

Delete old data from cdf delta table.

\n\n
Arguments:
\n\n
    \n
  • spec: terminator specifications.
  • \n
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.vacuum_cdf_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.vacuum_cdf_data", "kind": "function", "doc": "

Vacuum old data from cdf delta table.

\n\n
Arguments:
\n\n
    \n
  • spec: terminator specifications.
  • \n
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "kind": "module", "doc": "

Module with dataset optimizer terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer", "kind": "class", "doc": "

Class with dataset optimizer terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer.optimize_dataset", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer.optimize_dataset", "kind": "function", "doc": "

Optimize a dataset based on a set of pre-conceived optimizations.

\n\n

Most of the time the dataset is a table, but it can also be a purely file-based dataset.

\n\n
Arguments:
\n\n
    \n
  • db_table: database_name.table_name.
  • \n
  • location: dataset/table filesystem location.
  • \n
  • compute_table_stats: to compute table statistics or not.
  • \n
  • vacuum: (delta lake tables only) whether to vacuum the delta lake\ntable or not.
  • \n
  • vacuum_hours: (delta lake tables only) number of hours to consider\nin vacuum operation.
  • \n
  • optimize: (delta lake tables only) whether to optimize the table or\nnot. Custom optimize parameters can be supplied through ExecEnv (Spark)\nconfigs
  • \n
  • optimize_where: expression to use in the optimize function.
  • \n
  • optimize_zorder_col_list: (delta lake tables only) list of\ncolumns to consider in the zorder optimization process. Custom optimize\nparameters can be supplied through ExecEnv (Spark) configs.
  • \n
  • debug: flag indicating if we are just debugging this for local\ntests and therefore pass through all the exceptions to perform some\nassertions in local tests.
  • \n
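A minimal usage sketch based on the signature below (table and column names are illustrative):

from lakehouse_engine.terminators.dataset_optimizer import DatasetOptimizer

DatasetOptimizer.optimize_dataset(
    db_table="my_db.my_table",                 # illustrative table name
    vacuum=True,
    vacuum_hours=720,                          # keep roughly 30 days of history
    optimize=True,
    optimize_zorder_col_list=["customer_id"],  # illustrative zorder column
)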
\n", "signature": "(\tcls,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tcompute_table_stats: bool = True,\tvacuum: bool = True,\tvacuum_hours: int = 720,\toptimize: bool = True,\toptimize_where: Optional[str] = None,\toptimize_zorder_col_list: Optional[List[str]] = None,\tdebug: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier", "modulename": "lakehouse_engine.terminators.notifier", "kind": "module", "doc": "

Module with notification terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier", "kind": "class", "doc": "

Abstract Notification class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.__init__", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.__init__", "kind": "function", "doc": "

Construct Notification instances.

\n\n
Arguments:
\n\n
    \n
  • notification_spec: notification specification.
  • \n
\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.create_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.create_notification", "kind": "function", "doc": "

Abstract create notification method.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.send_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.send_notification", "kind": "function", "doc": "

Abstract send notification method.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.check_if_notification_is_failure_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.check_if_notification_is_failure_notification", "kind": "function", "doc": "

Check if given notification is a failure notification.

\n\n
Arguments:
\n\n
    \n
  • spec: spec to validate if it is a failure notification.
  • \n
\n\n
Returns:
\n\n
\n

A boolean telling if the notification is a failure notification

\n
\n", "signature": "(spec: lakehouse_engine.core.definitions.TerminatorSpec) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory", "modulename": "lakehouse_engine.terminators.notifier_factory", "kind": "module", "doc": "

Module for notifier factory.

\n"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory", "kind": "class", "doc": "

Class for notification factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.get_notifier", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.get_notifier", "kind": "function", "doc": "

Get a notifier according to the terminator specs using a factory.

\n\n
Arguments:
\n\n
    \n
  • spec: terminator specification.
  • \n
\n\n
Returns:
\n\n
\n

Notifier: notifier that will handle notifications.

\n
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.TerminatorSpec) -> lakehouse_engine.terminators.notifier.Notifier:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.generate_failure_notification", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.generate_failure_notification", "kind": "function", "doc": "

Check if it is necessary to send a failure notification and generate it.

\n\n
Arguments:
\n\n
    \n
  • spec: list of termination specs.
  • \n
  • exception: Exception that caused the failure.
  • \n
\n", "signature": "(spec: list, exception: Exception) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers", "modulename": "lakehouse_engine.terminators.notifiers", "kind": "module", "doc": "

Notifications module.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "kind": "module", "doc": "

Module with email notifier.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier", "kind": "class", "doc": "

Base Notification class.

\n", "bases": "lakehouse_engine.terminators.notifier.Notifier"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.__init__", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.__init__", "kind": "function", "doc": "

Construct Email Notification instance.

\n\n
Arguments:
\n\n
    \n
  • notification_spec: notification specification.
  • \n
\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.create_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.create_notification", "kind": "function", "doc": "

Creates the notification to be sent.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.send_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.send_notification", "kind": "function", "doc": "

Sends the notification by using a series of methods.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "kind": "module", "doc": "

Email notification templates.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates.NotificationsTemplates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "qualname": "NotificationsTemplates", "kind": "class", "doc": "

Templates for notifications.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "kind": "module", "doc": "

Module with sensor terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator", "kind": "class", "doc": "

Sensor Terminator class.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator.update_sensor_status", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator.update_sensor_status", "kind": "function", "doc": "

Update internal sensor status.

\n\n

Update the sensor status in the control table. It should be used to tell the\nsystem that the sensor has processed all new data that was previously\nidentified, hence updating the shifted sensor status.\nUsually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still\nto be identified - where we can update the sensor status from/to different statuses.

\n\n
Arguments:
\n\n
    \n
  • sensor_id: sensor id.
  • \n
  • control_db_table_name: db.table to store sensor checkpoints.
  • \n
  • status: status of the sensor.
  • \n
  • assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
  • \n
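An illustrative call, where the sensor id, control table and asset names are placeholders:

from lakehouse_engine.terminators.sensor_terminator import SensorTerminator

SensorTerminator.update_sensor_status(
    sensor_id="my_sensor",                          # placeholder sensor id
    control_db_table_name="my_db.sensor_control",   # placeholder control table
    status="PROCESSED_NEW_DATA",
    assets=["my_asset"],                            # assets released for downstream consumption
)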
\n", "signature": "(\tcls,\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.spark_terminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "kind": "module", "doc": "

Module with spark terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator", "kind": "class", "doc": "

Spark Terminator class.

\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator.terminate_spark", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator.terminate_spark", "kind": "function", "doc": "

Terminate spark session.

\n", "signature": "(cls) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.terminator_factory", "modulename": "lakehouse_engine.terminators.terminator_factory", "kind": "module", "doc": "

Module with the factory pattern to return terminators.

\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory", "kind": "class", "doc": "

TerminatorFactory class following the factory pattern.

\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory.execute_terminator", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory.execute_terminator", "kind": "function", "doc": "

Execute a terminator following the factory pattern.

\n\n
Arguments:
\n\n
    \n
  • spec: terminator specification.
  • \n
  • df: dataframe to be used in the terminator. Needed when a\nterminator requires one dataframe as input.
  • \n
\n\n
Returns:
\n\n
\n

Transformer function to be executed in .transform() spark function.

\n
\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TerminatorSpec,\tdf: Optional[pyspark.sql.dataframe.DataFrame] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers", "modulename": "lakehouse_engine.transformers", "kind": "module", "doc": "

Package to define transformers available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "kind": "module", "doc": "

Aggregators module.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators", "kind": "class", "doc": "

Class containing all aggregation functions.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators.get_max_value", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators.get_max_value", "kind": "function", "doc": "

Get the maximum value of a given column of a dataframe.

\n\n
Arguments:
\n\n
    \n
  • input_col: name of the input column.
  • \n
  • output_col: name of the output column (defaults to \"latest\").
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
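A short sketch of applying the returned function (the dataframe and column names are illustrative):

from lakehouse_engine.transformers.aggregators import Aggregators

latest_df = orders_df.transform(Aggregators.get_max_value("order_date"))
# latest_df holds a single row with the maximum "order_date" in a column named "latest"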
\n", "signature": "(input_col: str, output_col: str = 'latest') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators", "modulename": "lakehouse_engine.transformers.column_creators", "kind": "module", "doc": "

Column creators transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators", "kind": "class", "doc": "

Class containing all functions that can create columns to add value.

\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_row_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_row_id", "kind": "function", "doc": "

Create a sequential but not consecutive id.

\n\n
Arguments:
\n\n
    \n
  • output_col: optional name of the output column.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_auto_increment_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_auto_increment_id", "kind": "function", "doc": "

Create a sequential and consecutive id.

\n\n
Arguments:
\n\n
    \n
  • output_col: optional name of the output column.
  • \n
  • rdd: optional parameter to use spark rdd.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(cls, output_col: str = 'lhe_row_id', rdd: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_literals", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_literals", "kind": "function", "doc": "

Create columns given a map of column names and literal values (constants).

\n\n
Arguments:
\n\n
    \n
  • Dict[str, Any] literals: map of column names and literal values (constants).
  • \n
\n\n
Returns:
\n\n
\n

Callable: A function to be executed in the .transform() spark function.

\n
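A brief sketch of applying the returned function (column names and values are illustrative):

from lakehouse_engine.transformers.column_creators import ColumnCreators

df = df.transform(
    ColumnCreators.with_literals({"source_system": "erp", "is_active": True})
)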
\n", "signature": "(cls, literals: Dict[str, Any]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "kind": "module", "doc": "

Module with column reshaping transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers", "kind": "class", "doc": "

Class containing column reshaping transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.cast", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.cast", "kind": "function", "doc": "

Cast specific columns into the designated type.

\n\n
Arguments:
\n\n
    \n
  • cols: dict with the columns and the respective data types to cast them to.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
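An illustrative call (column names and target types are placeholders):

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

df = df.transform(ColumnReshapers.cast({"amount": "double", "order_date": "date"}))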
\n", "signature": "(cls, cols: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.column_selector", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.column_selector", "kind": "function", "doc": "

Select specific columns with specific output aliases.

\n\n
Arguments:
\n\n
    \n
  • cols: dict with columns to select and respective aliases.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(cls, cols: collections.OrderedDict) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.flatten_schema", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.flatten_schema", "kind": "function", "doc": "

Flatten the schema of the dataframe.

\n\n
Arguments:
\n\n
    \n
  • max_level: level until which you want to flatten the schema.\nDefault: None.
  • \n
  • shorten_names: whether to shorten the names of the prefixes\nof the fields being flattened or not. Default: False.
  • \n
  • alias: whether to define alias for the columns being flattened\nor not. Default: True.
  • \n
  • num_chars: number of characters to consider when shortening\nthe names of the fields. Default: 7.
  • \n
  • ignore_cols: columns which you don't want to flatten.\nDefault: None.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.explode_columns", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.explode_columns", "kind": "function", "doc": "

Explode columns with types like ArrayType and MapType.

\n\n

Afterwards, the flatten_schema transformation can be applied,\nif we desire, for example, to explode the map (as we explode a StructType)\nor to explode a StructType inside the array.\nWe recommend always specifying the columns you want to explode,\nrather than exploding all columns (see the sketch below).

\n\n
Arguments:
\n\n
    \n
  • explode_arrays: whether you want to explode array columns (True)\nor not (False). Default: False.
  • \n
  • array_cols_to_explode: array columns which you want to explode.\nIf you don't specify it will get all array columns and explode them.\nDefault: None.
  • \n
  • explode_maps: whether you want to explode map columns (True)\nor not (False). Default: False.
  • \n
  • map_cols_to_explode: map columns which you want to explode.\nIf you don't specify it will get all map columns and explode them.\nDefault: None.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
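A sketch of exploding a specific array column, as recommended above (the column name is illustrative):

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

df = df.transform(
    ColumnReshapers.explode_columns(
        explode_arrays=True,
        array_cols_to_explode=["line_items"],  # illustrative array column
    )
)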
\n", "signature": "(\tcls,\texplode_arrays: bool = False,\tarray_cols_to_explode: List[str] = None,\texplode_maps: bool = False,\tmap_cols_to_explode: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.with_expressions", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.with_expressions", "kind": "function", "doc": "

Execute Spark SQL expressions to create the specified columns.

\n\n

This function uses the Spark expr function. Check here.

\n\n
Arguments:
\n\n
    \n
  • cols_and_exprs: dict with columns and respective expressions to compute\n(Spark SQL expressions).
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
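An illustrative usage, where the column names and expressions are placeholders:

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

df = df.transform(
    ColumnReshapers.with_expressions(
        {"net_amount": "amount - discount", "load_year": "year(load_date)"}
    )
)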
\n", "signature": "(cls, cols_and_exprs: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.rename", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.rename", "kind": "function", "doc": "

Rename specific columns into the designated name.

\n\n
Arguments:
\n\n
    \n
  • cols: dict with columns and respective target names.
  • \n
  • escape_col_names: whether to escape column names (e.g. /BIC/COL1) or not.\nIf True it creates a column with the new name and drop the old one.\nIf False, uses the native withColumnRenamed Spark function.\nDefault: True.
  • \n
\n\n
Returns:
\n\n
\n

Function to be called in .transform() spark function.

\n
\n", "signature": "(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro", "kind": "function", "doc": "

Select all attributes from avro.

\n\n
Arguments:
\n\n
    \n
  • schema: the schema string.
  • \n
  • key_col: the name of the key column.
  • \n
  • value_col: the name of the value column.
  • \n
  • options: extra options (e.g., mode: \"PERMISSIVE\").
  • \n
  • expand_key: whether you want to expand the content inside the key\ncolumn or not. Default: false.
  • \n
  • expand_value: whether you want to expand the content inside the value\ncolumn or not. Default: true.
  • \n
\n\n
Returns:
\n\n
\n

Function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tschema: str = None,\tkey_col: str = 'key',\tvalue_col: str = 'value',\toptions: dict = None,\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro_with_registry", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro_with_registry", "kind": "function", "doc": "

Select all attributes from avro using a schema registry.

\n\n
Arguments:
\n\n
    \n
  • schema_registry: the url to the schema registry.
  • \n
  • value_schema: the name of the value schema entry in the schema registry.
  • \n
  • value_col: the name of the value column.
  • \n
  • key_schema: the name of the key schema entry in the schema\nregistry. Default: None.
  • \n
  • key_col: the name of the key column.
  • \n
  • expand_key: whether you want to expand the content inside the key\ncolumn or not. Default: false.
  • \n
  • expand_value: whether you want to expand the content inside the value\ncolumn or not. Default: true.
  • \n
\n\n
Returns:
\n\n
\n

Function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tschema_registry: str,\tvalue_schema: str,\tvalue_col: str = 'value',\tkey_schema: str = None,\tkey_col: str = 'key',\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_json", "kind": "function", "doc": "

Convert a json string into a json column (struct).

\n\n

The new json column can be added to the existing columns (default) or it can\nreplace all the others, becoming the only output column. The new column gets the\nsame name as the original one, suffixed with '_json'.

\n\n
Arguments:
\n\n
    \n
  • input_col: name of the input column containing the json string.
  • \n
  • schema_path: path to the StructType schema (spark schema).
  • \n
  • schema: dict with the StructType schema (spark schema).
  • \n
  • json_options: options to parse the json value.
  • \n
  • drop_all_cols: whether to drop all the input columns or not.\nDefaults to False.
  • \n
  • disable_dbfs_retry: optional flag to disable file storage dbfs.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
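A minimal sketch, assuming an illustrative input column and a schema dict following Spark's StructType json representation:

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

df = df.transform(
    ColumnReshapers.from_json(
        input_col="payload",  # illustrative column holding the json string
        schema={
            "type": "struct",
            "fields": [{"name": "id", "type": "string", "nullable": True, "metadata": {}}],
        },
        json_options={"mode": "PERMISSIVE"},
    )
)
# the parsed struct is added as a new column named "payload_json"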
\n", "signature": "(\tcls,\tinput_col: str,\tschema_path: Optional[str] = None,\tschema: Optional[dict] = None,\tjson_options: Optional[dict] = None,\tdrop_all_cols: bool = False,\tdisable_dbfs_retry: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.to_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.to_json", "kind": "function", "doc": "

Convert dataframe columns into a json value.

\n\n
Arguments:
\n\n
    \n
  • in_cols: name(s) of the input column(s).\nExample values:\n\"*\" - all\ncolumns; \"my_col\" - one column named \"my_col\";\n\"my_col1, my_col2\" - two columns.
  • \n
  • out_col: name of the output column.
  • \n
  • json_options: options to parse the json value.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tin_cols: List[str],\tout_col: str,\tjson_options: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers", "modulename": "lakehouse_engine.transformers.condensers", "kind": "module", "doc": "

Condensers module.

\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers", "kind": "class", "doc": "

Class containing all the functions to condensate data for later merges.

\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.condense_record_mode_cdc", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.condense_record_mode_cdc", "kind": "function", "doc": "

Condense Change Data Capture (CDC) based on record_mode strategy.

\n\n

This CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.

\n\n
Arguments:
\n\n
    \n
  • business_key: The business key (logical primary key) of the data.
  • \n
  • ranking_key_desc: In this type of CDC condensation the data needs to be\nin descending order in a certain way, using columns specified in this\nparameter.
  • \n
  • ranking_key_asc: In this type of CDC condensation the data needs to be\nin ascending order in a certain way, using columns specified in\nthis parameter.
  • \n
  • record_mode_col: Name of the record mode input_col.
  • \n
  • valid_record_modes: Depending on the context, not all record modes may be\nconsidered for condensation. Use this parameter to skip those.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(\tcls,\tbusiness_key: List[str],\trecord_mode_col: str,\tvalid_record_modes: List[str],\tranking_key_desc: Optional[List[str]] = None,\tranking_key_asc: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.group_and_rank", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.group_and_rank", "kind": "function", "doc": "

Condense data based on a simple group by + take latest mechanism.

\n\n
Arguments:
\n\n
    \n
  • group_key: list of column names to use in the group by.
  • \n
  • ranking_key: the data needs to be in descending order using columns\nspecified in this parameter.
  • \n
  • descending: if the ranking considers descending order or not. Defaults to\nTrue.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(\tcls,\tgroup_key: List[str],\tranking_key: List[str],\tdescending: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.custom_transformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "kind": "module", "doc": "

Custom transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers", "kind": "class", "doc": "

Class representing a CustomTransformers.

\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers.custom_transformation", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers.custom_transformation", "kind": "function", "doc": "

Execute a custom transformation provided by the user.

\n\n

This transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.

\n\n
\n\n
Attention!
\n\n

Please bear in mind that the custom_transformer function provided\nas an argument needs to receive a DataFrame and return a DataFrame,\nbecause that is how Spark's .transform method is able to chain the\ntransformations.

\n\n
\n\n

Example:

\n\n
\n
def my_custom_logic(df: DataFrame) -> DataFrame:\n
    return df.dropDuplicates()  # any pyspark logic that returns a DataFrame\n
\n
\n\n
Arguments:
\n\n
    \n
  • custom_transformer: custom transformer function. A python function with all\nrequired pyspark logic provided by the user.
  • \n
\n\n
Returns:
\n\n
\n

Callable: the same function provided as parameter, in order to be called\n later in the TransformerFactory.

\n
\n", "signature": "(custom_transformer: Callable) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers", "modulename": "lakehouse_engine.transformers.data_maskers", "kind": "module", "doc": "

Module with data masking transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers", "kind": "class", "doc": "

Class containing data masking transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.hash_masker", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.hash_masker", "kind": "function", "doc": "

Mask specific columns using a hashing approach.

\n\n
Arguments:
\n\n
    \n
  • cols: list of column names to mask.
  • \n
  • approach: hashing approach. Defaults to 'SHA'. There's \"MURMUR3\" as well.
  • \n
  • num_bits: number of bits of the SHA approach. Only applies to SHA approach.
  • \n
  • suffix: suffix to apply to new column name. Defaults to \"_hash\".\nNote: you can pass an empty suffix to have the original column replaced.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
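A short sketch (column names are illustrative):

from lakehouse_engine.transformers.data_maskers import DataMaskers

df = df.transform(DataMaskers.hash_masker(cols=["email", "phone"], suffix=""))
# with an empty suffix the original columns are replaced by their hashed values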
\n", "signature": "(\tcls,\tcols: List[str],\tapproach: str = 'SHA',\tnum_bits: int = 256,\tsuffix: str = '_hash') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.column_dropper", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.column_dropper", "kind": "function", "doc": "

Drop specific columns.

\n\n
Arguments:
\n\n
    \n
  • cols: list of column names to drop.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(cls, cols: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers", "modulename": "lakehouse_engine.transformers.date_transformers", "kind": "module", "doc": "

Module containing date transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers", "kind": "class", "doc": "

Class with set of transformers to transform dates in several forms.

\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.add_current_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.add_current_date", "kind": "function", "doc": "

Add column with current date.

\n\n

The current date comes from the driver as a constant, not from every executor.

\n\n
Arguments:
\n\n
    \n
  • output_col: name of the output column.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(output_col: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_date", "kind": "function", "doc": "

Convert multiple string columns with a source format into dates.

\n\n
Arguments:
\n\n
    \n
  • cols: list of names of the string columns to convert.
  • \n
  • source_format: dates source format (e.g., YYYY-MM-dd). Check here.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
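An illustrative call, where the column name and source format are placeholders:

from lakehouse_engine.transformers.date_transformers import DateTransformers

df = df.transform(DateTransformers.convert_to_date(["order_date"], "yyyy-MM-dd"))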
\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_timestamp", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_timestamp", "kind": "function", "doc": "

Convert multiple string columns with a source format into timestamps.

\n\n
Arguments:
\n\n
    \n
  • cols: list of names of the string columns to convert.
  • \n
  • source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS).\nCheck here.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.format_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.format_date", "kind": "function", "doc": "

Convert multiple date/timestamp columns into strings with the target format.

\n\n
Arguments:
\n\n
    \n
  • cols: list of names of the string columns to convert.
  • \n
  • target_format: strings target format (e.g., YYYY-MM-dd). Check here.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(cols: List[str], target_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.get_date_hierarchy", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.get_date_hierarchy", "kind": "function", "doc": "

Create day/month/week/quarter/year hierarchy for the provided date columns.

\n\n

Uses Spark's extract function.

\n\n
Arguments:
\n\n
    \n
  • cols: list of names of the date columns to create the hierarchy.
  • \n
  • formats: dict with the correspondence between the hierarchy and the format\nto apply. Check here.\nExample: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n}
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(cols: List[str], formats: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.exceptions", "modulename": "lakehouse_engine.transformers.exceptions", "kind": "module", "doc": "

Module for all the transformers exceptions.

\n"}, {"fullname": "lakehouse_engine.transformers.exceptions.WrongArgumentsException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "WrongArgumentsException", "kind": "class", "doc": "

Exception for when a user provides wrong arguments to a transformer.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.exceptions.UnsupportedStreamingTransformerException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "UnsupportedStreamingTransformerException", "kind": "class", "doc": "

Exception for when a user requests a transformer not supported in streaming.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.filters", "modulename": "lakehouse_engine.transformers.filters", "kind": "module", "doc": "

Module containing the filters transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters", "kind": "class", "doc": "

Class containing the filters transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.incremental_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.incremental_filter", "kind": "function", "doc": "

Incrementally Filter a certain dataframe given an increment logic.

\n\n

This logic can either be an increment value or an increment dataframe from\nwhich to get the latest value. Using a greater or equal operator for the\nfiltering process covers cases where we receive late arriving data not\ncovered in a previous load. You can set greater_or_equal to false to use\ngreater, when you trust the source will never output more data\nfor the increment after you have loaded the data (e.g., you will never load\ndata while the source is still dumping data, which may cause you to get an\nincomplete picture of the last arrived data).

\n\n
Arguments:
\n\n
    \n
  • input_col: input column name
  • \n
  • increment_value: value used to filter the data, considering the\nprovided input_col.
  • \n
  • increment_df: a dataframe to get the increment value from.\nyou either specify this or the increment_value (this takes precedence).\nThis is a good approach to get the latest value from a given dataframe\nthat was read and apply that value as filter here. In this way you can\nperform incremental loads based on the last value of a given dataframe\n(e.g., table or file based). Can be used together with the\nget_max_value transformer to accomplish these incremental based loads.\nSee our append load feature tests to see how to provide an acon for\nincremental loads, taking advantage of the scenario explained here.
  • \n
  • increment_col: name of the column from which to get the increment\nvalue (when using the increment_df approach). This assumes there's\nonly one row in the increment_df, which is why it is a good idea to use it\ntogether with the get_max_value transformer. Defaults to \"latest\"\nbecause that's the default output column name provided by the\nget_max_value transformer.
  • \n
  • greater_or_equal: if filtering should be done by also including the\nincrement value or not (useful for scenarios where you are performing\nincrement loads but still want to include data considering the increment\nvalue, and not only values greater than that increment... examples may\ninclude scenarios where you already loaded data including those values,\nbut the source produced more data containing those values).\nDefaults to false.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
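A sketch of the increment_df approach described above, combined with the get_max_value transformer (dataframes and column names are illustrative):

from lakehouse_engine.transformers.aggregators import Aggregators
from lakehouse_engine.transformers.filters import Filters

# single-row dataframe with the latest value already loaded (output column "latest")
latest_df = target_df.transform(Aggregators.get_max_value("load_date"))

new_data_df = source_df.transform(
    Filters.incremental_filter(
        input_col="load_date",
        increment_df=latest_df,
        greater_or_equal=True,  # also keep late-arriving data for the same increment value
    )
)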
\n", "signature": "(\tcls,\tinput_col: str,\tincrement_value: Optional[Any] = None,\tincrement_df: Optional[pyspark.sql.dataframe.DataFrame] = None,\tincrement_col: str = 'latest',\tgreater_or_equal: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.expression_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.expression_filter", "kind": "function", "doc": "

Filter a dataframe based on an expression.

\n\n
Arguments:
\n\n
    \n
  • exp: filter expression.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(exp: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.column_filter_exp", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.column_filter_exp", "kind": "function", "doc": "

Filter a dataframe's columns based on a list of SQL expressions.

\n\n
Arguments:
\n\n
    \n
  • exp: column filter expressions.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(exp: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.drop_duplicate_rows", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.drop_duplicate_rows", "kind": "function", "doc": "

Drop duplicate rows using spark function dropDuplicates().

\n\n

This transformer can be used with or without arguments.\nThe provided argument needs to be a list of columns.\nFor example: [\"Name\", \"VAT\"] will drop duplicate records within\nthe \"Name\" and \"VAT\" columns.\nIf the transformer is used without providing any columns list, or providing\nan empty list such as [], the result will be the same as using\nthe distinct() pyspark function. If the watermarker dict is present, it will\nensure that the drop operation only applies to rows within the watermark\ntimeline window.

\n\n
Arguments:
\n\n
    \n
  • cols: column names.
  • \n
  • watermarker: properties to apply watermarker to the transformer.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
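An illustrative call using the column names from the example above:

from lakehouse_engine.transformers.filters import Filters

df = df.transform(Filters.drop_duplicate_rows(cols=["Name", "VAT"]))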
\n", "signature": "(cols: List[str] = None, watermarker: dict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.joiners", "modulename": "lakehouse_engine.transformers.joiners", "kind": "module", "doc": "

Module with join transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners", "kind": "class", "doc": "

Class containing join transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners.join", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners.join", "kind": "function", "doc": "

Join two dataframes based on specified type and columns.

\n\n

Some stream to stream joins are only possible if you apply Watermark, so this\nmethod also provides a parameter to enable watermarking specification.

\n\n
Arguments:
\n\n
    \n
  • left_df_alias: alias of the first dataframe.
  • \n
  • join_with: right dataframe.
  • \n
  • right_df_alias: alias of the second dataframe.
  • \n
  • join_condition: condition to join dataframes.
  • \n
  • join_type: type of join. Defaults to inner.\nAvailable values: inner, cross, outer, full, full outer,\nleft, left outer, right, right outer, semi,\nleft semi, anti, and left anti.
  • \n
  • broadcast_join: whether to perform a broadcast join or not.
  • \n
  • select_cols: list of columns to select at the end.
  • \n
  • watermarker: properties to apply watermarking.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
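A brief sketch, where the dataframes, condition and selected columns are placeholders ('a' and 'b' are the default aliases):

from lakehouse_engine.transformers.joiners import Joiners

joined_df = orders_df.transform(
    Joiners.join(
        join_with=customers_df,
        join_condition="a.customer_id = b.customer_id",
        join_type="left",
        select_cols=["a.*", "b.customer_name"],
    )
)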
\n", "signature": "(\tcls,\tjoin_with: pyspark.sql.dataframe.DataFrame,\tjoin_condition: str,\tleft_df_alias: str = 'a',\tright_df_alias: str = 'b',\tjoin_type: str = 'inner',\tbroadcast_join: bool = True,\tselect_cols: Optional[List[str]] = None,\twatermarker: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.null_handlers", "modulename": "lakehouse_engine.transformers.null_handlers", "kind": "module", "doc": "

Module with null handlers transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers", "kind": "class", "doc": "

Class containing null handler transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers.replace_nulls", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers.replace_nulls", "kind": "function", "doc": "

Replace nulls in a dataframe.

\n\n
Arguments:
\n\n
    \n
  • replace_on_nums: if it is to replace nulls on numeric columns.\nApplies to ints, longs and floats.
  • \n
  • default_num_value: default integer value to use as replacement.
  • \n
  • replace_on_strings: if it is to replace nulls on string columns.
  • \n
  • default_string_value: default string value to use as replacement.
  • \n
  • subset_cols: list of columns in which to replace nulls. If not\nprovided, all nulls in all columns will be replaced as specified.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\treplace_on_nums: bool = True,\tdefault_num_value: int = -999,\treplace_on_strings: bool = True,\tdefault_string_value: str = 'UNKNOWN',\tsubset_cols: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "kind": "module", "doc": "

Optimizers module.

\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers", "kind": "class", "doc": "

Class containing all the functions that can provide optimizations.

\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.cache", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.cache", "kind": "function", "doc": "

Caches the current dataframe.

\n\n

The default storage level used is MEMORY_AND_DISK.

\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(cls) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.persist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.persist", "kind": "function", "doc": "

Caches the current dataframe with a specific StorageLevel.

\n\n
Arguments:
\n\n
    \n
  • storage_level: the type of StorageLevel, as default MEMORY_AND_DISK_DESER.\nMore options here.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(cls, storage_level: str = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.unpersist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.unpersist", "kind": "function", "doc": "

Removes the dataframe from the disk and memory.

\n\n
Arguments:
\n\n
    \n
  • blocking: whether to block until all the data blocks are\nremoved from disk/memory or run asynchronously.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(cls, blocking: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.regex_transformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "kind": "module", "doc": "

Regex transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers", "kind": "class", "doc": "

Class containing all regex functions.

\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers.with_regex_value", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers.with_regex_value", "kind": "function", "doc": "

Get the result of applying a regex to an input column (via regexp_extract).

\n\n
Arguments:
\n\n
    \n
  • input_col: name of the input column.
  • \n
  • output_col: name of the output column.
  • \n
  • regex: regular expression.
  • \n
  • drop_input_col: whether to drop input_col or not.
  • \n
  • idx: index to return.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed in the .transform() spark function.

\n
\n", "signature": "(\tinput_col: str,\toutput_col: str,\tregex: str,\tdrop_input_col: bool = False,\tidx: int = 1) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "kind": "module", "doc": "

Module with repartitioners transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners", "kind": "class", "doc": "

Class containing repartitioners transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.coalesce", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.coalesce", "kind": "function", "doc": "

Coalesce a dataframe into n partitions.

\n\n
Arguments:
\n\n
    \n
  • num_partitions: num of partitions to coalesce.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(cls, num_partitions: int) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.repartition", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.repartition", "kind": "function", "doc": "

Repartition a dataframe into n partitions.

\n\n

If num_partitions is provided repartitioning happens based on the provided\nnumber, otherwise it happens based on the values of the provided cols (columns).

\n\n
Arguments:
\n\n
    \n
  • num_partitions: num of partitions to repartition.
  • \n
  • cols: list of columns to use for repartitioning.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tnum_partitions: Optional[int] = None,\tcols: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.transformer_factory", "modulename": "lakehouse_engine.transformers.transformer_factory", "kind": "module", "doc": "

Module with the factory pattern to return transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory", "kind": "class", "doc": "

TransformerFactory class following the factory pattern.

\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory.get_transformer", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory.get_transformer", "kind": "function", "doc": "

Get a transformer following the factory pattern.

\n\n
Arguments:
\n\n
    \n
  • spec: transformer specification (individual transformation... not to be\nconfused with list of all transformations).
  • \n
  • data: ordered dict of dataframes to be transformed. Needed when a\ntransformer requires more than one dataframe as input.
  • \n
\n\n
Returns:
\n\n
\n

Transformer function to be executed in .transform() spark function.

\n
\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TransformerSpec,\tdata: OrderedDict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions", "modulename": "lakehouse_engine.transformers.unions", "kind": "module", "doc": "

Module with union transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions", "kind": "class", "doc": "

Class containing union transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union", "kind": "function", "doc": "

Union dataframes, resolving columns by position (not by name).

\n\n
Arguments:
\n\n
    \n
  • union_with: list of dataframes to union.
  • \n
  • deduplication: whether to perform deduplication of elements or not.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union_by_name", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union_by_name", "kind": "function", "doc": "

Union dataframes, resolving columns by name (not by position).

\n\n
Arguments:
\n\n
    \n
  • union_with: list of dataframes to union.
  • \n
  • deduplication: whether to perform deduplication of elements or not.
  • \n
  • allow_missing_columns: allow the union of DataFrames with different\nschemas.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True,\tallow_missing_columns: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "kind": "module", "doc": "

Watermarker module.

\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker", "kind": "class", "doc": "

Class containing all watermarker transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker.with_watermark", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker.with_watermark", "kind": "function", "doc": "

Get the dataframe with watermarker defined.

\n\n
Arguments:
\n\n
    \n
  • watermarker_column: name of the input column to be considered for\nthe watermarking. Note: it must be a timestamp.
  • \n
  • watermarker_time: time window to define the watermark value.
  • \n
\n\n
Returns:
\n\n
\n

A function to be executed on other transformers.

\n
\n", "signature": "(watermarker_column: str, watermarker_time: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils", "modulename": "lakehouse_engine.utils", "kind": "module", "doc": "

Utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.configs", "modulename": "lakehouse_engine.utils.configs", "kind": "module", "doc": "

Config utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils", "modulename": "lakehouse_engine.utils.configs.config_utils", "kind": "module", "doc": "

Module to read configurations.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils", "kind": "class", "doc": "

Config utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_acon", "kind": "function", "doc": "

Get acon based on a filesystem path or on a dict.

\n\n
Arguments:
\n\n
    \n
  • acon_path: path of the acon (algorithm configuration) file.
  • \n
  • acon: acon provided directly through python code (e.g., notebooks\nor other apps).
  • \n
  • disable_dbfs_retry: optional flag to disable file storage dbfs.
  • \n
\n\n
Returns:
\n\n
\n

Dict representation of an acon.

\n
\n", "signature": "(\tcls,\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tdisable_dbfs_retry: bool = False) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_config", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_config", "kind": "function", "doc": "

Get the lakehouse engine configuration file.

\n\n
Returns:
\n\n
\n

Configuration dictionary

\n
\n", "signature": "(package: str = 'lakehouse_engine.configs') -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_engine_version", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_engine_version", "kind": "function", "doc": "

Get Lakehouse Engine version from the installed packages.

\n\n
Returns:
\n\n
\n

String of engine version.

\n
\n", "signature": "(cls) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_json_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_json_acon", "kind": "function", "doc": "

Read an acon (algorithm configuration) file.

\n\n
Arguments:
\n\n
    \n
  • path: path to the acon file.
  • \n
  • disable_dbfs_retry: optional flag to disable file storage dbfs.
  • \n
\n\n
Returns:
\n\n
\n

The acon file content as a dict.

\n
\n", "signature": "(path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_sql", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_sql", "kind": "function", "doc": "

Read a DDL file in Spark SQL format from a cloud object storage system.

\n\n
Arguments:
\n\n
    \n
  • path: path to the SQL file.
  • \n
  • disable_dbfs_retry: optional flag to disable file storage dbfs.
  • \n
\n\n
Returns:
\n\n
\n

Content of the SQL file.

\n
\n", "signature": "(path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.remove_sensitive_info", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.remove_sensitive_info", "kind": "function", "doc": "

Remove sensitive info from a dictionary.

\n\n
Arguments:
\n\n
    \n
  • dict_to_replace: dict where we want to remove sensitive info.
  • \n
\n\n
Returns:
\n\n
\n

dict without sensitive information.

\n
\n", "signature": "(cls, dict_to_replace: Union[dict, list]) -> Union[dict, list]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "kind": "module", "doc": "

Utilities for databricks operations.

\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils", "kind": "class", "doc": "

Databricks utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_db_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_db_utils", "kind": "function", "doc": "

Get db utils on databricks.

\n\n
Arguments:
\n\n
    \n
  • spark: spark session.
  • \n
\n\n
Returns:
\n\n
\n

Dbutils from databricks.

\n
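A minimal sketch, assuming the code runs on a Databricks cluster with an active SparkSession:

    from pyspark.sql import SparkSession

    from lakehouse_engine.utils.databricks_utils import DatabricksUtils

    spark = SparkSession.builder.getOrCreate()

    # Returns the dbutils handle when running on Databricks.
    dbutils = DatabricksUtils.get_db_utils(spark)
    # dbutils.fs.ls("/") could then be used as usual.
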
\n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_databricks_job_information", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_databricks_job_information", "kind": "function", "doc": "

Get notebook context from running acon.

\n\n
Arguments:
\n\n
    \n
  • spark: spark session.
  • \n
\n\n
Returns:
\n\n
\n

Dict containing databricks notebook context.

\n
\n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Tuple[str, str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats", "modulename": "lakehouse_engine.utils.engine_usage_stats", "kind": "module", "doc": "

Utilities for recording the engine activity.

\n"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats.EngineUsageStats", "modulename": "lakehouse_engine.utils.engine_usage_stats", "qualname": "EngineUsageStats", "kind": "class", "doc": "

Engine Usage utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats.EngineUsageStats.store_engine_usage", "modulename": "lakehouse_engine.utils.engine_usage_stats", "qualname": "EngineUsageStats.store_engine_usage", "kind": "function", "doc": "

Collect and store Lakehouse Engine usage statistics.

\n\n

These statistics include the acon and other relevant information, such as\nthe lakehouse engine version and the functions/algorithms being used.

\n\n
Arguments:
\n\n
    \n
  • acon: acon dictionary file.
  • \n
  • func_name: function name that called this log acon.
  • \n
  • collect_engine_usage: Lakehouse usage statistics collection strategy.
  • \n
  • spark_confs: optional dictionary with the spark confs to be used when\ncollecting the engine usage.
  • \n
\n", "signature": "(\tcls,\tacon: dict,\tfunc_name: str,\tcollect_engine_usage: str = None,\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats.EngineUsageStats.get_spark_conf_values", "modulename": "lakehouse_engine.utils.engine_usage_stats", "qualname": "EngineUsageStats.get_spark_conf_values", "kind": "function", "doc": "

Get information from spark session configurations.

\n\n
Arguments:
\n\n
    \n
  • usage_stats: usage_stats dictionary file.
  • \n
  • spark_confs: optional dictionary with the spark tags to be used when\ncollecting the engine usage.
  • \n
\n", "signature": "(cls, usage_stats: dict, spark_confs: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.expectations_utils", "modulename": "lakehouse_engine.utils.expectations_utils", "kind": "module", "doc": "

Utilities to be used by custom expectations.

\n"}, {"fullname": "lakehouse_engine.utils.expectations_utils.validate_result", "modulename": "lakehouse_engine.utils.expectations_utils", "qualname": "validate_result", "kind": "function", "doc": "

Validates the test results of the custom expectations.

\n\n

If you need to make additional validations on your custom expectation\nand/or require additional fields to be returned you can add them before\ncalling this function. The partial_success and partial_result\noptional parameters can be used to pass the result of additional\nvalidations and add more information to the result key of the\nreturned dict respectively.

\n\n
Arguments:
\n\n
    \n
  • expectation: Expectation to validate.
  • \n
  • configuration: Configuration used in the test.
  • \n
  • metrics: Test result metrics.
  • \n
  • partial_success: Result of validations done before calling this method.
  • \n
  • partial_result: Extra fields to be returned to the user.
  • \n
\n\n
Returns:
\n\n
\n

The result of the validation.

\n
\n", "signature": "(\texpectation: great_expectations.expectations.expectation.Expectation,\tconfiguration: great_expectations.core.expectation_configuration.ExpectationConfiguration,\tmetrics: Dict,\tpartial_success: bool = True,\tpartial_result: dict = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction", "modulename": "lakehouse_engine.utils.extraction", "kind": "module", "doc": "

Extraction utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "kind": "module", "doc": "

Utilities module for JDBC extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType", "kind": "class", "doc": "

Standardize the types of extractions we can have from a JDBC source.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.INIT", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.INIT", "kind": "variable", "doc": "

\n", "default_value": "<JDBCExtractionType.INIT: 'init'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.DELTA", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.DELTA", "kind": "variable", "doc": "

\n", "default_value": "<JDBCExtractionType.DELTA: 'delta'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction", "kind": "class", "doc": "

Configurations available for an Extraction from a JDBC source.

\n\n

These configurations cover:

\n\n
    \n
  • user: username to connect to JDBC source.
  • \n
  • password: password to connect to JDBC source (always use secrets,\ndon't use text passwords in your code).
  • \n
  • url: url to connect to JDBC source.
  • \n
  • dbtable: database.table to extract data from.
  • \n
  • calc_upper_bound_schema: custom schema used for the upper bound calculation.
  • \n
  • changelog_table: table of type changelog from which to extract data,\nwhen the extraction type is delta.
  • \n
  • partition_column: column used to split the extraction.
  • \n
  • latest_timestamp_data_location: data location (e.g., s3) containing the data\nto get the latest timestamp already loaded into bronze.
  • \n
  • latest_timestamp_data_format: the format of the dataset in\nlatest_timestamp_data_location. Default: delta.
  • \n
  • extraction_type: type of extraction (delta or init). Default: \"delta\".
  • \n
  • driver: JDBC driver name. Default: \"com.sap.db.jdbc.Driver\".
  • \n
  • num_partitions: number of Spark partitions to split the extraction.
  • \n
  • lower_bound: lower bound to decide the partition stride.
  • \n
  • upper_bound: upper bound to decide the partition stride. If\ncalculate_upper_bound is True, then upperBound will be\nderived by our upper bound optimizer, using the partition column.
  • \n
  • default_upper_bound: the value to use as default upper bound in case\nthe result of the upper bound calculation is None. Default: \"1\".
  • \n
  • fetch_size: how many rows to fetch per round trip. Default: \"100000\".
  • \n
  • compress: enable network compression. Default: True.
  • \n
  • custom_schema: specify custom_schema for particular columns of the\nreturned dataframe in the init/delta extraction of the source table.
  • \n
  • min_timestamp: min timestamp to consider to filter the changelog data.\nDefault: None and automatically derived from the location provided.\nIn case this one is provided it has precedence and the calculation\nis not done.
  • \n
  • max_timestamp: max timestamp to consider to filter the changelog data.\nDefault: None and automatically derived from the table having information\nabout the extraction requests, their timestamps and their status.\nIn case this one is provided it has precedence and the calculation\nis not done.
  • \n
  • generate_predicates: whether to generate predicates automatically or not.\nDefault: False.
  • \n
  • predicates: list containing all values to partition (if generate_predicates\nis used, the manual values provided are ignored). Default: None.
  • \n
  • predicates_add_null: whether to consider null on predicates list.\nDefault: True.
  • \n
  • extraction_timestamp: the timestamp of the extraction. Default: current time\nfollowing the format \"%Y%m%d%H%M%S\".
  • \n
  • max_timestamp_custom_schema: custom schema used on the max_timestamp derivation\nfrom the table holding the extraction requests information.
  • \n
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20240617164531',\tmax_timestamp_custom_schema: Optional[str] = None)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from particularly relevant JDBC sources.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.__init__", "kind": "function", "doc": "

Construct JDBCExtractionUtils.

\n\n
Arguments:
\n\n
    \n
  • jdbc_extraction: JDBC Extraction configurations. Can be of type:\nJDBCExtraction, SAPB4Extraction or SAPBWExtraction.
  • \n
\n", "signature": "(jdbc_extraction: Any)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_additional_spark_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_additional_spark_options", "kind": "function", "doc": "

Helper to get additional Spark Options initially passed.

\n\n

If people provide additional Spark options that are not covered by the util function\narguments (get_spark_jdbc_options), we need to consider them.\nThus, we update the options retrieved by the utils by checking whether there is\nany Spark option initially provided that is not yet considered in the retrieved\noptions or function arguments and whose value is not None.\nIf these conditions are met, we add those options and return the complete dict.

\n\n
Arguments:
\n\n
    \n
  • input_spec: the input specification.
  • \n
  • options: dict with Spark options.
  • \n
  • ignore_options: list of options to be ignored by the process.\nSpark read has two different approaches to parallelize the\nreading process: one of them uses upper/lower bound and the\nother uses predicates. These approaches cannot be used at the\nsame time, so you must choose one of them.\nWhen choosing predicates, you cannot pass lower and upper bound,\nnor the number of partitions and the partition column, otherwise\nSpark will interpret the execution as partitioned by upper and\nlower bound and will expect all of those variables to be filled.\nTo avoid hardcoding all the predicates in the acon, there is a\nfeature that automatically generates all predicates for the init\nor delta load based on the input partition column. However, at the\nend of the process, the partition column cannot be passed to the\noptions, because we are choosing the predicates execution, and\nthat is why, to generate predicates, we need to pass some options\nto ignore.
  • \n
\n\n
Returns:
\n\n
\n

a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).

\n
\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\toptions: dict,\tignore_options: List = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_predicates", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_predicates", "kind": "function", "doc": "

Get the predicates list, based on a predicates query.

\n\n
Arguments:
\n\n
    \n
  • predicates_query: query to use as the basis to get the distinct values for\na specified column, based on which predicates are generated.
  • \n
\n\n
Returns:
\n\n
\n

List containing the predicates to use to split the extraction from\n JDBC sources.

\n
\n", "signature": "(self, predicates_query: str) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_options", "kind": "function", "doc": "

Get the Spark options to extract data from a JDBC source.

\n\n
Returns:
\n\n
\n

The Spark jdbc args dictionary, including the query to submit\n and also options args dictionary.

\n
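A minimal sketch tying the JDBCExtraction configuration to this util; every connection value below is a placeholder (use secrets in real code), and the chosen partition column is hypothetical:

    from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
        JDBCExtraction,
        JDBCExtractionUtils,
    )

    jdbc_config = JDBCExtraction(
        user="tech_user",
        password="not-a-real-secret",  # always fetch this from a secret scope
        url="jdbc:sap://myhost:30015",
        dbtable="MYDB.MY_SOURCE_TABLE",
        extraction_type="init",
        num_partitions=10,
        partition_column="RECORD",  # hypothetical column
    )

    jdbc_utils = JDBCExtractionUtils(jdbc_config)

    # Per the description above: first the jdbc args (including the query to submit),
    # then the additional options dictionary.
    jdbc_args, jdbc_options = jdbc_utils.get_spark_jdbc_options()
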
\n", "signature": "(self) -> Tuple[dict, dict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "kind": "function", "doc": "

Get an optimal upperBound to properly split a Spark JDBC extraction.

\n\n
Returns:
\n\n
\n

Either an int, date or timestamp to serve as upperBound Spark JDBC option.

\n
\n", "signature": "(self) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "kind": "module", "doc": "

Utilities module for SAP B4 extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes", "kind": "class", "doc": "

Standardize the types of ADSOs we can have for Extractions from SAP B4.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.AQ", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.AQ", "kind": "variable", "doc": "

\n", "annotation": ": str", "default_value": "<ADSOTypes.AQ: 'AQ'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.CL", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.CL", "kind": "variable", "doc": "

\n", "annotation": ": str", "default_value": "<ADSOTypes.CL: 'CL'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.SUPPORTED_TYPES", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.SUPPORTED_TYPES", "kind": "variable", "doc": "

\n", "annotation": ": list", "default_value": "<ADSOTypes.SUPPORTED_TYPES: ['AQ', 'CL']>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction", "kind": "class", "doc": "

Configurations available for an Extraction from SAP B4.

\n\n

It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

\n\n

These configurations cover:

\n\n
    \n
  • latest_timestamp_input_col: the column containing the request timestamps\nin the dataset in latest_timestamp_data_location. Default: REQTSN.
  • \n
  • request_status_tbl: the name of the SAP B4 table having information\nabout the extraction requests. Composed of database.table.\nDefault: SAPHANADB.RSPMREQUEST.
  • \n
  • request_col_name: name of the column having the request timestamp to join\nwith the request status table. Default: REQUEST_TSN.
  • \n
  • data_target: the data target to extract from. Used in the join operation with\nthe request status table.
  • \n
  • act_req_join_condition: the join condition with the activation table\ncan be changed using this property.\nDefault: 'tbl.reqtsn = req.request_col_name'.
  • \n
  • include_changelog_tech_cols: whether to include the technical columns\n(usually coming from the changelog table) or not.
  • \n
  • extra_cols_req_status_tbl: columns to be added from request status table.\nIt needs to contain the prefix \"req.\". E.g. \"req.col1 as column_one,\nreq.col2 as column_two\".
  • \n
  • request_status_tbl_filter: filter to use for filtering the request status table,\ninfluencing the calculation of the max timestamps and the delta extractions.
  • \n
  • adso_type: the type of ADSO that you are extracting from. Can be \"AQ\" or \"CL\".
  • \n
  • max_timestamp_custom_schema: the custom schema to apply on the calculation of\nthe max timestamp to consider for the delta extractions.\nDefault: timestamp DECIMAL(23,0).
  • \n
  • default_max_timestamp: the timestamp to use as default, when it is not possible\nto derive one.
  • \n
  • custom_schema: specify custom_schema for particular columns of the\nreturned dataframe in the init/delta extraction of the source table.
  • \n
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: str = 'REQTSN DECIMAL(23,0)',\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20240617164531',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)',\tlatest_timestamp_input_col: str = 'REQTSN',\trequest_status_tbl: str = 'SAPHANADB.RSPMREQUEST',\trequest_col_name: str = 'REQUEST_TSN',\tdata_target: Optional[str] = None,\tact_req_join_condition: Optional[str] = None,\tinclude_changelog_tech_cols: Optional[bool] = None,\textra_cols_req_status_tbl: Optional[str] = None,\trequest_status_tbl_filter: Optional[str] = None,\tadso_type: Optional[str] = None,\tdefault_max_timestamp: str = '1970000000000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from SAP B4.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.__init__", "kind": "function", "doc": "

Construct SAPB4ExtractionUtils.

\n\n
Arguments:
\n\n
    \n
  • sap_b4_extraction: SAP B4 Extraction configurations.
  • \n
\n", "signature": "(\tsap_b4_extraction: lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.get_data_target", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.get_data_target", "kind": "function", "doc": "

Get the data_target from the data_target option or derive it.

\n\n

By definition, the data_target is the same for the table and the changelog table:\nit is the dbtable string ignoring everything before the last / and the first and last\ncharacter after it. E.g., for a dbtable /BIC/abtable12, the data_target\nwould be btable1.

\n\n
Arguments:
\n\n
    \n
  • input_spec_opt: options from the input_spec.
  • \n
\n\n
Returns:
\n\n
\n

A string with the data_target.

\n
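A minimal sketch of the rule above; assuming the options dict carries the source table under a "dbtable" key (an assumption for illustration only):

    from lakehouse_engine.utils.extraction.sap_b4_extraction_utils import SAPB4ExtractionUtils

    # Hypothetical input spec options.
    input_spec_options = {"dbtable": "/BIC/abtable12"}

    # Following the rule above, /BIC/abtable12 would yield the data_target "btable1".
    data_target = SAPB4ExtractionUtils.get_data_target(input_spec_options)
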
\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "kind": "module", "doc": "

Utilities module for SAP BW extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction", "kind": "class", "doc": "

Configurations available for an Extraction from SAP BW.

\n\n

It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

\n\n

These configurations cover:

\n\n
    \n
  • latest_timestamp_input_col: the column containing the actrequest timestamp\nin the dataset in latest_timestamp_data_location. Default:\n\"actrequest_timestamp\".
  • \n
  • act_request_table: the name of the SAP BW activation requests table.\nComposed of database.table. Default: SAPPHA.RSODSACTREQ.
  • \n
  • request_col_name: name of the column having the request to join\nwith the activation request table. Default: actrequest.
  • \n
  • act_req_join_condition: the join condition with the activation table\ncan be changed using this property.\nDefault: 'changelog_tbl.request = act_req.request_col_name'.
  • \n
  • odsobject: name of BW Object, used for joining with the activation request\ntable to get the max actrequest_timestamp to consider while filtering\nthe changelog table.
  • \n
  • include_changelog_tech_cols: whether to include the technical columns\n(usually coming from the changelog table) or not. Default: True.
  • \n
  • extra_cols_act_request: list of columns to be added from act request table.\nIt needs to contain the prefix \"act_req.\". E.g. \"act_req.col1\nas column_one, act_req.col2 as column_two\".
  • \n
  • get_timestamp_from_act_request: whether to get init timestamp\nfrom act request table or assume current/given timestamp.
  • \n
  • sap_bw_schema: sap bw schema. Default: SAPPHA.
  • \n
  • max_timestamp_custom_schema: the custom schema to apply on the calculation of\nthe max timestamp to consider for the delta extractions.\nDefault: timestamp DECIMAL(23,0).
  • \n
  • default_max_timestamp: the timestamp to use as default, when it is not possible\nto derive one.
  • \n
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20240617164531',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)',\tlatest_timestamp_input_col: str = 'actrequest_timestamp',\tact_request_table: str = 'SAPPHA.RSODSACTREQ',\trequest_col_name: str = 'actrequest',\tact_req_join_condition: Optional[str] = None,\todsobject: Optional[str] = None,\tinclude_changelog_tech_cols: bool = True,\textra_cols_act_request: Optional[str] = None,\tget_timestamp_from_act_request: bool = False,\tsap_bw_schema: str = 'SAPPHA',\tdefault_max_timestamp: str = '197000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from SAP BW.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.__init__", "kind": "function", "doc": "

Construct SAPBWExtractionUtils.

\n\n
Arguments:
\n\n
    \n
  • sap_bw_extraction: SAP BW Extraction configurations.
  • \n
\n", "signature": "(\tsap_bw_extraction: lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_changelog_table", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_changelog_table", "kind": "function", "doc": "

Get the changelog table, given an odsobject.

\n\n
Returns:
\n\n
\n

String to use as changelog_table.

\n
\n", "signature": "(self) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_odsobject", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_odsobject", "kind": "function", "doc": "

Get the odsobject based on the provided options.

\n\n

The table name may also include the db name, so we need to split it.\nMoreover, people might need to specify the odsobject if\nit is different from the dbtable.

\n\n
Arguments:
\n\n
    \n
  • input_spec_opt: options from the input_spec.
  • \n
\n\n
Returns:
\n\n
\n

A string with the odsobject.

\n
\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "kind": "module", "doc": "

Utilities module for SFTP extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat", "kind": "class", "doc": "

Formats of algorithm input.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.CSV", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.FWF", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.FWF", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.FWF: 'fwf'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.JSON", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.XML", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.XML", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.XML: 'xml'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter", "kind": "class", "doc": "

Standardize the types of filters we can have from a SFTP source.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.file_name_contains", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.file_name_contains", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.file_name_contains: 'file_name_contains'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LATEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LATEST_FILE", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.LATEST_FILE: 'latest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.EARLIEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.EARLIEST_FILE", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.EARLIEST_FILE: 'earliest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.GREATER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.GREATER_THAN", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.GREATER_THAN: 'date_time_gt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LOWER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LOWER_THAN", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.LOWER_THAN: 'date_time_lt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from particularly relevant SFTP sources.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_files_list", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_files_list", "kind": "function", "doc": "

Get a list of files to be extracted from SFTP.

\n\n

The arguments (options_args) to list files are:

\n\n
    \n
  • date_time_gt(str):\nFilter the files greater than the string datetime\nformatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
  • \n
  • date_time_lt(str):\nFilter the files lower than the string datetime\nformatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
  • \n
  • earliest_file(bool):\nFilter the earliest dated file in the directory.
  • \n
  • file_name_contains(str):\nFilter the files whose name contains the pattern.
  • \n
  • latest_file(bool):\nFilter the most recent dated file in the directory.
  • \n
  • sub_dir(bool):\nWhen true, the engine will search for files in the subdirectories\nof the remote_path.\nIt will consider one level below the remote_path.\nWhen sub_dir is used with the latest_file/earliest_file argument,\nthe engine will retrieve the latest_file/earliest_file\nfor each subdirectory.
  • \n
\n\n
Arguments:
\n\n
    \n
  • sftp: the SFTP client object.
  • \n
  • remote_path: path of files to be filtered.
  • \n
  • options_args: options from the acon.
  • \n
\n\n
Returns:
\n\n
\n

A list containing the file names to be passed to Spark.

\n
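A minimal sketch combining get_sftp_client (documented below) with this function; the hostname, credentials and paths are placeholders:

    from lakehouse_engine.utils.extraction.sftp_extraction_utils import SFTPExtractionUtils

    # Placeholder connection parameters; use secrets for the credentials in real code.
    connection_options = {
        "hostname": "sftp.example.com",
        "port": 22,
        "username": "tech_user",
        "password": "not-a-real-secret",
        "add_auto_policy": True,
    }

    sftp, transport = SFTPExtractionUtils.get_sftp_client(connection_options)
    try:
        # Keep only the most recent file whose name contains "sales".
        files = SFTPExtractionUtils.get_files_list(
            sftp,
            remote_path="/inbound/sales/",
            options_args={"file_name_contains": "sales", "latest_file": True},
        )
    finally:
        transport.close()
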
\n", "signature": "(\tcls,\tsftp: paramiko.sftp_client.SFTPClient,\tremote_path: str,\toptions_args: dict) -> Set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_sftp_client", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_sftp_client", "kind": "function", "doc": "

Get the SFTP client.

\n\n

The SFTP client is used to open an SFTP session across an open\nSSH Transport and perform remote file operations.

\n\n
Arguments:
\n\n
    \n
  • options_args: dictionary containing SFTP connection parameters.\nThe Paramiko arguments expected to connect are:

    \n\n
      \n
    • \"hostname\": the server to connect to.
    • \n
    • \"port\": the server port to connect to.
    • \n
    • \"username\": the username to authenticate as.
    • \n
    • \"password\": used for password authentication.
    • \n
    • \"pkey\": optional - an optional public key to use for\nauthentication.
    • \n
    • \"passphrase\" \u2013 optional - options used for decrypting private\nkeys.
    • \n
    • \"key_filename\" \u2013 optional - the filename, or list of filenames,\nof optional private key(s) and/or certs to try for\nauthentication.
    • \n
    • \"timeout\" \u2013 an optional timeout (in seconds) for the TCP connect.
    • \n
    • \"allow_agent\" \u2013 optional - set to False to disable\nconnecting to the SSH agent.
    • \n
    • \"look_for_keys\" \u2013 optional - set to False to disable searching\nfor discoverable private key files in ~/.ssh/.
    • \n
    • \"compress\" \u2013 optional - set to True to turn on compression.
    • \n
    • \"sock\" - optional - an open socket or socket-like object\nto use for communication to the target host.
    • \n
    • \"gss_auth\" \u2013 optional - True if you want to use GSS-API\nauthentication.
    • \n
    • \"gss_kex\" \u2013 optional - Perform GSS-API Key Exchange and\nuser authentication.
    • \n
    • \"gss_deleg_creds\" \u2013 optional - Delegate GSS-API client\ncredentials or not.
    • \n
    • \"gss_host\" \u2013 optional - The targets name in the kerberos database.
    • \n
    • \"gss_trust_dns\" \u2013 optional - Indicates whether or\nnot the DNS is trusted to securely canonicalize the name of the\nhost being connected to (default True).
    • \n
    • \"banner_timeout\" \u2013 an optional timeout (in seconds)\nto wait for the SSH banner to be presented.
    • \n
    • \"auth_timeout\" \u2013 an optional timeout (in seconds)\nto wait for an authentication response.
    • \n
    • \"disabled_algorithms\" \u2013 an optional dict passed directly to\nTransport and its keyword argument of the same name.
    • \n
    • \"transport_factory\" \u2013 an optional callable which is handed a\nsubset of the constructor arguments (primarily those related\nto the socket, GSS functionality, and algorithm selection)\nand generates a Transport instance to be used by this client.\nDefaults to Transport.__init__.
    • \n
    \n\n

    The parameter to specify the private key is expected to be in\nRSA format. Attempting a connection with a blank host key is\nnot allowed unless the argument \"add_auto_policy\" is explicitly\nset to True.

  • \n
\n\n
Returns:
\n\n
\n

sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.

\n
\n", "signature": "(\tcls,\toptions_args: dict) -> Tuple[paramiko.sftp_client.SFTPClient, paramiko.transport.Transport]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_format", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_format", "kind": "function", "doc": "

Validate the file extension based on the format definitions.

\n\n
Arguments:
\n\n
    \n
  • files_format: a string containing the file extension.
  • \n
\n\n
Returns:
\n\n
\n

The string validated and formatted.

\n
\n", "signature": "(cls, files_format: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_location", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_location", "kind": "function", "doc": "

Validate the location. Add \"/\" in the case it does not exist.

\n\n
Arguments:
\n\n
    \n
  • location: file path.
  • \n
\n\n
Returns:
\n\n
\n

The location validated.

\n
\n", "signature": "(cls, location: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils", "modulename": "lakehouse_engine.utils.file_utils", "kind": "module", "doc": "

Utilities for file name based operations.

\n"}, {"fullname": "lakehouse_engine.utils.file_utils.get_file_names_without_file_type", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_file_names_without_file_type", "kind": "function", "doc": "

Function to retrieve the list of file names in a folder.

\n\n

This function filters by file type and removes the extension of the file name\nit returns.

\n\n
Arguments:
\n\n
    \n
  • path: path to the folder to list files
  • \n
  • file_type: type of the file to include in list
  • \n
  • exclude_regex: regex of file names to exclude
  • \n
\n\n
Returns:
\n\n
\n

A list of file names without file type.

\n
\n", "signature": "(path: str, file_type: str, exclude_regex: str) -> list:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils.get_directory_path", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_directory_path", "kind": "function", "doc": "

Add '/' to the end of the path of a directory.

\n\n
Arguments:
\n\n
    \n
  • path: directory to be processed
  • \n
\n\n
Returns:
\n\n
\n

Directory path stripped and with '/' at the end.

\n
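A minimal sketch of the two file_utils helpers above; the folder, file type and regex are placeholders, and whether file_type expects the leading dot is an assumption here:

    from lakehouse_engine.utils.file_utils import (
        get_directory_path,
        get_file_names_without_file_type,
    )

    # Hypothetical folder with DDL files; exclude anything containing "_backup".
    names = get_file_names_without_file_type(
        path="/dbfs/mnt/my_project/ddl", file_type=".sql", exclude_regex=".*_backup.*"
    )

    # Guarantees a trailing "/" so the path can be safely concatenated with file names.
    ddl_dir = get_directory_path("/dbfs/mnt/my_project/ddl")  # -> "/dbfs/mnt/my_project/ddl/"
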
\n", "signature": "(path: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils", "modulename": "lakehouse_engine.utils.gab_utils", "kind": "module", "doc": "

Module to define GAB Utility classes.

\n"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils", "kind": "class", "doc": "

Class containing utility functions for GAB.

\n"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.logger", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.logger", "kind": "function", "doc": "

Store the execution of each stage in the log events table.

\n\n
Arguments:
\n\n
    \n
  • run_start_time: execution start time.
  • \n
  • run_end_time: execution end time.
  • \n
  • start: use case start date.
  • \n
  • end: use case end date.
  • \n
  • query_id: gab configuration table use case identifier.
  • \n
  • query_label: gab configuration table use case name.
  • \n
  • cadence: cadence to process.
  • \n
  • stage_file_path: stage file path.
  • \n
  • query: query to execute.
  • \n
  • status: status of the query execution.
  • \n
  • error_message: error message if present.
  • \n
  • target_database: target database to write.
  • \n
\n", "signature": "(\tself,\trun_start_time: datetime.datetime,\trun_end_time: datetime.datetime,\tstart: str,\tend: str,\tquery_id: str,\tquery_label: str,\tcadence: str,\tstage_file_path: str,\tquery: str,\tstatus: str,\terror_message: Union[Exception, str],\ttarget_database: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.get_json_column_as_dict", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.get_json_column_as_dict", "kind": "function", "doc": "

Get JSON column as dictionary.

\n\n
Arguments:
\n\n
    \n
  • lookup_query_builder: gab configuration data.
  • \n
  • query_id: gab configuration table use case identifier.
  • \n
  • query_column: column to get as json.
  • \n
\n", "signature": "(\tcls,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\tquery_id: str,\tquery_column: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.extract_columns_from_mapping", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.extract_columns_from_mapping", "kind": "function", "doc": "

Extract and transform columns to SQL select statement.

\n\n
Arguments:
\n\n
    \n
  • columns: data to extract the columns.
  • \n
  • is_dimension: flag identifying if is a dimension or a metric.
  • \n
  • extract_column_without_alias: flag to inform if it's to extract columns\nwithout aliases.
  • \n
  • table_alias: name or alias from the source table.
  • \n
  • is_extracted_value_as_name: identify if the extracted value is the\ncolumn name.
  • \n
\n", "signature": "(\tcls,\tcolumns: dict,\tis_dimension: bool,\textract_column_without_alias: bool = False,\ttable_alias: Optional[str] = None,\tis_extracted_value_as_name: bool = True) -> Union[tuple[list[str], list[str]], list[str]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.get_cadence_configuration_at_end_date", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.get_cadence_configuration_at_end_date", "kind": "function", "doc": "

Return a dictionary that corresponds to the conclusion of a cadence.

\n\n

For any end date inputted by the user, we check whether this end date is actually the end of\n a cadence (YEAR, QUARTER, MONTH, WEEK).\nIf the user input is 2024-03-31, this is both a month end and a quarter end, which\n means any use cases configured as month or quarter need to be calculated.

\n\n
Arguments:
\n\n
    \n
  • end_date: base end date.
  • \n
\n", "signature": "(cls, end_date: datetime.datetime) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.get_reconciliation_cadences", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.get_reconciliation_cadences", "kind": "function", "doc": "

Get reconciliation cadences based on the use case configuration.

\n\n
Arguments:
\n\n
    \n
  • cadence: cadence to process.
  • \n
  • selected_reconciliation_window: configured use case reconciliation window.
  • \n
  • cadence_configuration_at_end_date: cadences to execute at the end date.
  • \n
  • rerun_flag: flag indicating if it's a rerun or a normal run.
  • \n
\n", "signature": "(\tself,\tcadence: str,\tselected_reconciliation_window: dict,\tcadence_configuration_at_end_date: dict,\trerun_flag: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.format_datetime_to_default", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.format_datetime_to_default", "kind": "function", "doc": "

Format datetime to GAB default format.

\n\n
Arguments:
\n\n
    \n
  • date_to_format: date to format.
  • \n
\n", "signature": "(cls, date_to_format: datetime.datetime) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABPartitionUtils", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABPartitionUtils", "kind": "class", "doc": "

Class to extract a partition based in a date period.

\n"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABPartitionUtils.get_years", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABPartitionUtils.get_years", "kind": "function", "doc": "

Return a list of distinct years from the input parameters.

\n\n
Arguments:
\n\n
    \n
  • start_date: start of the period.
  • \n
  • end_date: end of the period.
  • \n
\n", "signature": "(cls, start_date: str, end_date: str) -> list[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABPartitionUtils.get_partition_condition", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABPartitionUtils.get_partition_condition", "kind": "function", "doc": "

Return the year, month and day partition statement from the input parameters.

\n\n
Arguments:
\n\n
    \n
  • start_date: start of the period.
  • \n
  • end_date: end of the period.
  • \n
\n", "signature": "(cls, start_date: str, end_date: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler", "modulename": "lakehouse_engine.utils.logging_handler", "kind": "module", "doc": "

Module to configure project logging.

\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData", "kind": "class", "doc": "

Logging filter to hide sensitive data from being shown in the logs.

\n", "bases": "logging.Filter"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData.filter", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData.filter", "kind": "function", "doc": "

Hide sensitive information from being shown in the logs.

\n\n

Based on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).

\n\n
Arguments:
\n\n
    \n
  • record: the LogRecord event being logged.
  • \n
\n\n
Returns:
\n\n
\n

The transformed record to be logged.

\n
\n", "signature": "(self, record: logging.LogRecord) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler", "kind": "class", "doc": "

Handle the logging of the lakehouse engine project.

\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.__init__", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.__init__", "kind": "function", "doc": "

Construct a LoggingHandler instance.

\n\n
Arguments:
\n\n
    \n
  • class_name: name of the class to be indicated in the logs.
  • \n
\n", "signature": "(class_name: str)"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.get_logger", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.get_logger", "kind": "function", "doc": "

Get the _logger instance variable.

\n\n
Returns:
\n\n
\n

the logger object.

\n
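A minimal sketch of obtaining a logger for a class:

    from lakehouse_engine.utils.logging_handler import LoggingHandler

    # The class_name is shown in the log records to identify the caller.
    logger = LoggingHandler(class_name="MyAlgorithm").get_logger()
    logger.info("Starting the data load.")
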
\n", "signature": "(self) -> logging.Logger:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils", "modulename": "lakehouse_engine.utils.schema_utils", "kind": "module", "doc": "

Utilities to facilitate dataframe schema management.

\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils", "kind": "class", "doc": "

Schema utils that help retrieve and manage schemas of dataframes.

\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file", "kind": "function", "doc": "

Get a spark schema from a file (spark StructType json file) in a file system.

\n\n
Arguments:
\n\n
    \n
  • file_path: path of the file in a file system.
  • \n
  • disable_dbfs_retry: optional flag to disable the DBFS file storage retry.
  • \n
\n\n
Returns:
\n\n
\n

Spark schema struct type.

\n
\n", "signature": "(\tfile_path: str,\tdisable_dbfs_retry: bool = False) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file_to_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file_to_dict", "kind": "function", "doc": "

Get a dict with the spark schema from a file in a file system.

\n\n
Arguments:
\n\n
    \n
  • file_path: path of the file in a file system.
  • \n
  • disable_dbfs_retry: optional flag to disable the DBFS file storage retry.
  • \n
\n\n
Returns:
\n\n
\n

Spark schema in a dict.

\n
\n", "signature": "(file_path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_dict", "kind": "function", "doc": "

Get a spark schema from a dict.

\n\n
Arguments:
\n\n
    \n
  • struct_type: dict containing a spark schema structure.
  • \n
\n\n
Returns:
\n\n
\n

Spark schema struct type.

\n
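A minimal sketch, using the standard StructType JSON layout for the dict:

    from lakehouse_engine.utils.schema_utils import SchemaUtils

    schema_dict = {
        "type": "struct",
        "fields": [
            {"name": "id", "type": "long", "nullable": False, "metadata": {}},
            {"name": "name", "type": "string", "nullable": True, "metadata": {}},
        ],
    }

    schema = SchemaUtils.from_dict(schema_dict)  # -> pyspark StructType
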
\n", "signature": "(struct_type: dict) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_table_schema", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_table_schema", "kind": "function", "doc": "

Get a spark schema from a table.

\n\n
Arguments:
\n\n
    \n
  • table: table name from which to inherit the schema.
  • \n
\n\n
Returns:
\n\n
\n

Spark schema struct type.

\n
\n", "signature": "(table: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "

Get a spark schema from an input specification.

\n\n

This covers scenarios where the schema is provided as part of the input\nspecification of the algorithm. The schema can come from the table specified in the\ninput specification (enforce_schema_from_table) or from the dict with the spark\nschema also provided there.

\n\n
Arguments:
\n\n
    \n
  • input_spec: input specification.
  • \n
\n\n
Returns:
\n\n
\n

spark schema struct type.

\n
\n", "signature": "(\tcls,\tinput_spec: lakehouse_engine.core.definitions.InputSpec) -> Optional[pyspark.sql.types.StructType]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.schema_flattener", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.schema_flattener", "kind": "function", "doc": "

Recursive method to flatten the schema of the dataframe.

\n\n
Arguments:
\n\n
    \n
  • schema: schema to be flattened.
  • \n
  • prefix: prefix of the struct to get the value for. Only relevant\nfor being used in the internal recursive logic.
  • \n
  • level: level of the depth in the schema being flattened. Only relevant\nfor being used in the internal recursive logic.
  • \n
  • max_level: level until which you want to flatten the schema. Default: None.
  • \n
  • shorten_names: whether to shorten the names of the prefixes of the fields\nbeing flattened or not. Default: False.
  • \n
  • alias: whether to define alias for the columns being flattened or\nnot. Default: True.
  • \n
  • num_chars: number of characters to consider when shortening the names of\nthe fields. Default: 7.
  • \n
  • ignore_cols: columns which you don't want to flatten. Default: None.
  • \n
\n\n
Returns:
\n\n
\n

A function to be called in .transform() spark function.

\n
\n", "signature": "(\tschema: pyspark.sql.types.StructType,\tprefix: str = None,\tlevel: int = 1,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.sql_parser_utils", "modulename": "lakehouse_engine.utils.sql_parser_utils", "kind": "module", "doc": "

Module to parse sql files.

\n"}, {"fullname": "lakehouse_engine.utils.sql_parser_utils.SQLParserUtils", "modulename": "lakehouse_engine.utils.sql_parser_utils", "qualname": "SQLParserUtils", "kind": "class", "doc": "

Parser utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.sql_parser_utils.SQLParserUtils.split_sql_commands", "modulename": "lakehouse_engine.utils.sql_parser_utils", "qualname": "SQLParserUtils.split_sql_commands", "kind": "function", "doc": "

Read the sql commands of a file and choose how to split them.

\n\n
Arguments:
\n\n
    \n
  • sql_commands: commands to be split.
  • \n
  • delimiter: delimiter to split the sql commands.
  • \n
  • advanced_parser: boolean to define if we need to use a complex split.
  • \n
\n\n
Returns:
\n\n
\n

List with the sql commands.

\n
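A minimal sketch; instantiating SQLParserUtils without arguments is an assumption, since the constructor is not documented here:

    from lakehouse_engine.utils.sql_parser_utils import SQLParserUtils

    sql_text = (
        "CREATE TABLE IF NOT EXISTS db.t1 (id INT);\n"
        "INSERT INTO db.t1 VALUES (1);"
    )

    commands = SQLParserUtils().split_sql_commands(
        sql_commands=sql_text, delimiter=";", advanced_parser=False
    )
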
\n", "signature": "(\tself,\tsql_commands: str,\tdelimiter: str,\tadvanced_parser: bool) -> list[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage", "modulename": "lakehouse_engine.utils.storage", "kind": "module", "doc": "

Utilities to interact with storage systems.

\n"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "kind": "module", "doc": "

Module to represent a DBFS file storage system.

\n"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage.DBFSStorage", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "qualname": "DBFSStorage", "kind": "class", "doc": "

Class to represent a DBFS file storage system.

\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage.DBFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "qualname": "DBFSStorage.get_file_payload", "kind": "function", "doc": "

Get the content of a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
\n\n
Returns:
\n\n
\n

File payload/content.

\n
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage.DBFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "qualname": "DBFSStorage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
  • content: content to write into the file.
  • \n
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage", "modulename": "lakehouse_engine.utils.storage.file_storage", "kind": "module", "doc": "

Module for abstract representation of a storage system holding files.

\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage", "kind": "class", "doc": "

Abstract file storage class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.get_file_payload", "kind": "function", "doc": "

Get the payload of a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
\n\n
Returns:
\n\n
\n

File payload/content.

\n
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
  • content: content to write into the file.
  • \n
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "kind": "module", "doc": "

Module for common file storage functions.

\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions", "kind": "class", "doc": "

Class for common file storage functions.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_json", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_json", "kind": "function", "doc": "

Read a json file.

\n\n

The file should be in a supported file system (e.g., s3, dbfs or\nlocal filesystem).

\n\n
Arguments:
\n\n
    \n
  • path: path to the json file.
  • \n
  • disable_dbfs_retry: optional flag to disable the retry using DBFS file storage.
  • \n
\n\n
Returns:
\n\n
\n

Dict with json file content.

\n
\n", "signature": "(cls, path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_sql", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_sql", "kind": "function", "doc": "

Read a sql file.

\n\n

The file should be in a supported file system (e.g., s3, dbfs or local\nfilesystem).

\n\n
Arguments:
\n\n
    \n
  • path: path to the sql file.
  • \n
  • disable_dbfs_retry: optional flag to disable the retry using DBFS file storage.
  • \n
\n\n
Returns:
\n\n
\n

Content of the SQL file.

\n
\n", "signature": "(cls, path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.write_payload", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.write_payload", "kind": "function", "doc": "

Write payload into a file.

\n\n

The file should be in a supported file system (e.g., s3, dbfs or local\nfilesystem).

\n\n
Arguments:
\n\n
    \n
  • path: path to validate the file type.
  • \n
  • url: url of the file.
  • \n
  • content: content to write into the file.
  • \n
  • disable_dbfs_retry: optional flag to disable the retry using DBFS file storage.
  • \n
\n", "signature": "(\tcls,\tpath: str,\turl: urllib.parse.ParseResult,\tcontent: str,\tdisable_dbfs_retry: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.is_boto3_configured", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.is_boto3_configured", "kind": "function", "doc": "

Check if boto3 is able to locate credentials and is properly configured.

\n\n

If boto3 is not properly configured, we might want to try a different reader.

\n", "signature": "() -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "kind": "module", "doc": "

Module to represent a local file storage system.

\n"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage", "kind": "class", "doc": "

Class to represent a local file storage system.

\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.get_file_payload", "kind": "function", "doc": "

Get the payload of a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
\n\n
Returns:
\n\n
\n

File payload/content.

\n
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> <class 'TextIO'>:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
  • content: content to write into the file.
  • \n
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "kind": "module", "doc": "

Module to represent an S3 file storage system.

\n"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage", "kind": "class", "doc": "

Class to represent an S3 file storage system.

\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.get_file_payload", "kind": "function", "doc": "

Get the payload of a config file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
\n\n
Returns:
\n\n
\n

File payload/content.

\n
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n
Arguments:
\n\n
    \n
  • url: url of the file.
  • \n
  • content: content to write into the file.
  • \n
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine_usage", "modulename": "lakehouse_engine_usage", "kind": "module", "doc": "

How to use the Lakehouse Engine?

\n\n

Lakehouse engine usage examples for all the algorithms and other core functionalities.

\n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader", "modulename": "lakehouse_engine_usage.data_loader", "kind": "module", "doc": "

Data Loader

\n\n

How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file?

\n\n

An algorithm (e.g., data load) in the lakehouse-engine is configured using an ACON. The lakehouse-engine is a\nconfiguration-driven framework, so people don't have to write code to execute a Spark algorithm. Instead, the\nalgorithm is written in pyspark and accepts configurations through a JSON file (an ACON - algorithm configuration). The\nACON is the configuration that defines the behaviour of a lakehouse engine algorithm. You can check the algorithm code, and\nhow it interprets the ACON, here.\nOn this page we go through the structure of an ACON file and which ACON configurations are most suitable for common data\nengineering scenarios.\nCheck the pages underneath to find several ACON examples that cover many data extraction, transformation and loading scenarios.

\n\n

Overview of the Structure of the ACON file for DataLoads

\n\n

An ACON-based algorithm needs several specifications to work properly, some of which are optional. The available\nspecifications are:

\n\n
    \n
  • Input specifications (input_specs): specify how to read data. This is a mandatory keyword.
  • \n
  • Transform specifications (transform_specs): specify how to transform data.
  • \n
  • Data quality specifications (dq_specs): specify how to execute the data quality process.
  • \n
  • Output specifications (output_specs): specify how to write data to the target. This is a mandatory keyword.
  • \n
  • Terminate specifications (terminate_specs): specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc.).
  • \n
  • Execution environment (exec_env): custom Spark Session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance-related configs here, for example).
  • \n
\n\n

Below is an example of a complete ACON file that reads from an s3 folder with CSVs and incrementally loads that data (using a merge) into a delta lake table.

\n\n
\n\n

spec_id is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in transform_specs) of a DataFrame that was read in the input_specs. Check the ACON below to see how the spec_id of the input_specs is used as input_id in one transform specification.

\n\n
\n\n
\n
from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "orders_bronze",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json",\n      "with_filepath": True,\n      "options": {\n        "badRecordsPath": "s3://my-data-product-bucket/badrecords/order_events_with_dq/",\n        "header": False,\n        "delimiter": "\\u005E",\n        "dateFormat": "yyyyMMdd"\n      },\n      "location": "s3://my-data-product-bucket/bronze/orders/"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "orders_bronze_with_extraction_date",\n      "input_id": "orders_bronze",\n      "transformers": [\n        {\n          "function": "with_row_id"\n        },\n        {\n          "function": "with_regex_value",\n          "args": {\n            "input_col": "lhe_extraction_filepath",\n            "output_col": "extraction_date",\n            "drop_input_col": True,\n            "regex": ".*WE_SO_SCL_(\\\\d+).csv"\n          }\n        }\n      ]\n    }\n  ],\n  "dq_specs": [\n    {\n      "spec_id": "check_orders_bronze_with_extraction_date",\n      "input_id": "orders_bronze_with_extraction_date",\n      "dq_type": "validator",\n      "result_sink_db_table": "my_database.my_table_dq_checks",\n      "fail_on_error": False,\n      "dq_functions": [\n        {\n          "dq_function": "expect_column_values_to_not_be_null",\n          "args": {\n            "column": "omnihub_locale_code"\n          }\n        },\n        {\n          "dq_function": "expect_column_unique_value_count_to_be_between",\n          "args": {\n            "column": "product_division",\n            "min_value": 10,\n            "max_value": 100\n          }\n        },\n        {\n          "dq_function": "expect_column_max_to_be_between",\n          "args": {\n            "column": "so_net_value",\n            "min_value": 10,\n            "max_value": 1000\n          }\n        },\n        {\n          "dq_function": "expect_column_value_lengths_to_be_between",\n          "args": {\n            "column": "omnihub_locale_code",\n            "min_value": 1,\n            "max_value": 10\n          }\n        },\n        {\n          "dq_function": "expect_column_mean_to_be_between",\n          "args": {\n            "column": "coupon_code",\n            "min_value": 15,\n            "max_value": 20\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "orders_silver",\n      "input_id": "check_orders_bronze_with_extraction_date",\n      "data_format": "delta",\n      "write_type": "merge",\n      "partitions": [\n        "order_date_header"\n      ],\n      "merge_opts": {\n        "merge_predicate": """\n            new.sales_order_header = current.sales_order_header\n            and new.sales_order_schedule = current.sales_order_schedule\n            and new.sales_order_item=current.sales_order_item\n            and new.epoch_status=current.epoch_status\n            and new.changed_on=current.changed_on\n            and new.extraction_date=current.extraction_date\n            and new.lhe_batch_id=current.lhe_batch_id\n            and new.lhe_row_id=current.lhe_row_id\n        """,\n        "insert_only": True\n      },\n      "db_table": "my_database.my_table_with_dq",\n      "location": "s3://my-data-product-bucket/silver/order_events_with_dq/",\n      "with_batch_id": True,\n      "options": {\n        "checkpointLocation": 
"s3://my-data-product-bucket/checkpoints/order_events_with_dq/"\n      }\n    }\n  ],\n  "terminate_specs": [\n    {\n      "function": "optimize_dataset",\n      "args": {\n        "db_table": "my_database.my_table_with_dq"\n      }\n    }\n  ],\n  "exec_env": {\n    "spark.databricks.delta.schema.autoMerge.enabled": True\n  }\n}\n\nload_data(acon=acon)\n
\n
\n\n

Input Specifications

\n\n

You specify how to read the data by providing a list of Input Specifications. Usually there's just one element in that\nlist, as, in the lakehouse, you are generally focused on reading data from one layer (e.g., source, bronze, silver,\ngold) and writing it to the next layer. However, there may be scenarios where you would like to combine two datasets (e.g.,\njoins or incremental filtering on one dataset based on the values of another\none), and therefore you can use one or more elements.\nMore information about InputSpecs.

\n\n
Relevant notes
\n\n
    \n
  • A spec id is fundamental, so you can use the input data later on in any step of the algorithm (transform, write, dq process, terminate).
  • \n
  • You don't have to specify db_table and location at the same time. Depending on the data_format, sometimes you read from a table (e.g., a jdbc or deltalake table) and sometimes you read from a location (e.g., files like deltalake, parquet, json, avro... or a kafka topic). See the sketch below.
  • \n
\n\n
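To make the db_table vs location note concrete, below is a minimal, hedged sketch of two alternative input specifications (all spec ids, formats, table names and paths are illustrative): one reading a delta table registered in the catalog via db_table, and one reading files directly via location.

\n\n
\n
# Minimal sketch (illustrative names/paths): provide either db_table or location,
# depending on where the data lives, not both at the same time.
input_spec_from_table = {
    "spec_id": "sales_bronze",
    "read_type": "batch",
    "data_format": "delta",
    "db_table": "my_database.my_bronze_table",
}

input_spec_from_files = {
    "spec_id": "sales_bronze",
    "read_type": "batch",
    "data_format": "parquet",
    "location": "s3://my-data-product-bucket/bronze/sales/",
}
\n
\n\n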

Transform Specifications

\n\n

In the lakehouse engine, you transform data by providing a transform specification, which contains a list of transform functions (transformers). So the transform specification acts upon one input, and it can execute multiple lakehouse engine transformation functions (transformers) upon that input.

\n\n

If you look at the example above, we ask the lakehouse engine to execute two functions on the orders_bronze input\ndata: with_row_id and with_regex_value. Those functions can of course receive arguments. You can see a list of all\navailable transformation functions (transformers) in lakehouse_engine.transformers. Then, you just invoke them in\nyour ACON as demonstrated above, following exactly the same function names and parameter names as described in the code\ndocumentation. A minimal sketch is shown after the notes below.\nMore information about TransformSpec.

\n\n
Relevant notes
\n\n
    \n
  • This stage is fully optional, you can omit it from the ACON.
  • \n
  • There is one relevant option force_streaming_foreach_batch_processing that can be used to force the transform to be\nexecuted in the foreachBatch function to ensure non-supported streaming operations can be properly executed. You don't\nhave to worry about this if you are using regular lakehouse engine transformers. But if you are providing your custom\nlogic in pyspark code via our lakehouse engine\ncustom_transformation (lakehouse_engine.transformers.custom_transformers) then sometimes your logic may contain\nSpark functions that are not compatible with Spark Streaming, and therefore this flag can enable all of your\ncomputation to be streaming-compatible by pushing down all the logic into the foreachBatch() function.
  • \n
\n\n
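To make the notes above concrete, here is a minimal, hedged sketch of a transform specification that chains two of the transformers already used on this page (spec ids, the column name and the increment value are illustrative).

\n\n
\n
# Minimal sketch (illustrative names): one transform spec acting on the DataFrame
# produced by the input spec with spec_id "orders_bronze".
transform_specs = [
    {
        "spec_id": "orders_bronze_prepared",
        "input_id": "orders_bronze",
        "transformers": [
            # add a technical row id column
            {"function": "with_row_id"},
            # keep only rows with a date greater than a fixed increment value
            {
                "function": "incremental_filter",
                "args": {"input_col": "date", "increment_value": "2024-01-01"},
            },
        ],
    }
]
\n
\n\n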

Data Quality Specifications

\n\n

One of the most relevant features of the lakehouse engine is that you can have data quality guardrails that prevent you\nfrom loading bad data into your target layer (e.g., bronze, silver or gold). The lakehouse engine data quality process\nincludes one main feature at the moment:

\n\n
    \n
  • Validator: The capability to perform data quality checks on that data (e.g., is the max value of a column bigger\nthan x?) and even tag your data with the results of the DQ checks.
  • \n
\n\n

The output of the data quality process can be written into a Result Sink target (e.g. table or files) and is integrated with a Data Docs website, which can be a company-wide available website for people to check the quality of their data and share with others.

\n\n

To achieve all of this functionality the lakehouse engine uses Great Expectations internally. To hide the Great Expectations internals from our user base and provide friendlier abstractions using the ACON, we have developed the concept of the DQSpec, which can contain many DQFunctionSpec objects. This relationship is very similar to the one between TransformSpec and TransformerSpec, meaning you can have multiple Great Expectations functions executed inside a single data quality specification (as in the ACON above).

\n\n
\n\n

The names of the functions and args are a 1-to-1 match with the Great Expectations API.

\n\n
\n\n

More information about DQSpec.

\n\n
Relevant notes
\n\n
    \n
  • You can write the outputs of the DQ process to a sink through the result_sink* parameters of the\nDQSpec. result_sink_options takes any Spark options for a DataFrame writer, which means you can specify the options\naccording to your sink format (e.g., delta, parquet, json, etc.). We usually recommend using \"delta\" as the format (see the sketch after these notes).
  • \n
  • You can use the results of the DQ checks to tag the data that you are validating. When configured, these details will\nappear as a new column (like any other), as part of the tables of your Data Product.
  • \n
  • To be able to analyse the data written by result_sink*, you can set result_sink_explode to true (which is the default), so that some columns get expanded. Those are:\n
      \n
    • General columns: Those are columns that hold the basic information regarding dq_specs; they will always have values\nand do not depend on the expectation types chosen.\n - Columns: checkpoint_config, run_name, run_time, run_results, success, validation_result_identifier, spec_id, input_id, validation_results, run_time_year, run_time_month, run_time_day.
    • \n
    • Statistics columns: Those are columns with information about the expectation runs; the values refer to the run as a whole, not to each individual expectation. Those columns come from run_results.validation_result.statistics.*.\n
        \n
      • Columns: evaluated_expectations, success_percent, successful_expectations, unsuccessful_expectations.
      • \n
    • \n
    • Expectations columns: Those are columns that have information about the expectation executed.\n
        \n
      • Columns: expectation_type, batch_id, expectation_success, exception_info. Those columns are exploded\nfrom run_results.validation_result.results\ninside expectation_config.expectation_type, expectation_config.kwargs.batch_id, success as expectation_success,\nand exception_info. Moreover, we also include unexpected_index_list, observed_value and kwargs.
      • \n
    • \n
    • Arguments of Expectations columns: Those are columns that will depend on the expectation_type selected. Those\ncolumns are exploded from run_results.validation_result.results inside expectation_config.kwargs.*.\n
        \n
      • We can have for\nexample: column, column_A, column_B, max_value, min_value, value, value_pairs_set, value_set,\nand others.
      • \n
    • \n
    • More columns desired? Those can be added, using result_sink_extra_columns in which you can select columns\nlike <name> and/or explode columns like <name>.*.
    • \n
  • \n
  • Use the parameter \"source\" to identify the data being validated, which makes the analysis easier.
  • \n
  • By default, Great Expectations will also provide a site presenting the history of the DQ validations that you have performed on your data.
  • \n
  • You can make an analysis of all your expectations and create a dashboard aggregating all that information.
  • \n
  • This stage is fully optional, you can omit it from the ACON.
  • \n
\n\n
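To illustrate the result_sink* notes above, here is a minimal, hedged sketch of a dq_specs entry that writes the DQ results to a result sink table (all names are illustrative; only parameters already mentioned on this page are used).

\n\n
\n
# Minimal sketch (illustrative names): a validator DQ spec writing its results to a
# result sink table, with the expanded/exploded result columns enabled (the default).
dq_specs = [
    {
        "spec_id": "check_orders_bronze",
        "input_id": "orders_bronze_prepared",
        "dq_type": "validator",
        "source": "orders",  # identifies the data being validated in the result sink
        "result_sink_db_table": "my_database.my_dq_results",
        "result_sink_explode": True,  # default: expand the result columns
        "fail_on_error": False,
        "dq_functions": [
            {
                "dq_function": "expect_column_values_to_not_be_null",
                "args": {"column": "salesorder"},
            }
        ],
    }
]
\n
\n\n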

Output Specifications

\n\n

The output_specs section of an ACON is relatively similar to the input_specs section, but of course focusing on how to write the results of the algorithm, instead of specifying the input for the algorithm, hence the name output_specs (output specifications). More information about OutputSpec.

\n\n
Relevant notes
\n\n
    \n
  • Respect the supported write types and output formats.
  • \n
  • One of the most relevant options to specify in the options parameter is checkpointLocation when in streaming\nread mode, because that location will be responsible for storing which data you have already read and transformed from the\nsource, when the source is a Spark Streaming compatible source (e.g., Kafka or S3 files). See the sketch below.
  • \n
\n\n
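To make the checkpointLocation note concrete, here is a minimal, hedged sketch of an output specification for a streaming append into a delta table (spec ids, the table name and the path are illustrative).

\n\n
\n
# Minimal sketch (illustrative names/paths): streaming append into a delta table.
# The checkpointLocation option tracks what was already read from the streaming source.
output_specs = [
    {
        "spec_id": "sales_silver",
        "input_id": "orders_bronze_prepared",
        "write_type": "append",
        "data_format": "delta",
        "db_table": "my_database.my_silver_table",
        "options": {
            "checkpointLocation": "s3://my-data-product-bucket/checkpoints/sales_silver/"
        },
    }
]
\n
\n\n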

Terminate Specifications

\n\n

The terminate_specs section of the ACON is responsible for some \"wrapping up\" activities like optimising a table,\nvacuuming old files in a delta table, etc. With time, the list of available terminators will likely increase (e.g.,\nreconciliation processes), but for now we have the following terminators.\nThis stage is fully optional; you can omit it from the ACON.\nThe most relevant ones in the context of the lakehouse initiative are the following:

\n\n\n\n

More information about TerminatorSpec.

\n\n
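As a concrete illustration, a terminate_specs entry reusing the optimize_dataset terminator shown in the ACON at the top of this page could look like the following sketch (the table name is illustrative).

\n\n
\n
# Minimal sketch (illustrative table name): optimise the target table after writing.
terminate_specs = [
    {
        "function": "optimize_dataset",
        "args": {"db_table": "my_database.my_silver_table"},
    }
]
\n
\n\n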

Execution Environment

\n\n

In the exec_env section of the ACON you can pass any Spark Session configuration that you want to define for the\nexecution of your algorithm. This is basically just a JSON structure that takes in any Spark Session property, so no\ncustom lakehouse engine logic. This stage is fully optional, you can omit it from the ACON.

\n\n
\n\n

Please be aware that Spark Session configurations that are not allowed to be changed when the Spark cluster is already\nrunning need to be passed in the configuration of the job/cluster that runs this algorithm, not here in this section.\nThis section only accepts Spark Session configs that can be changed in runtime. Whenever you introduce an option make\nsure that it takes effect during runtime, as to the best of our knowledge there's no list of allowed Spark properties\nto be changed after the cluster is already running. Moreover, typically Spark algorithms fail if you try to modify a\nconfig that can only be set up before the cluster is running.

\n\n
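As a small illustration, an exec_env section containing a runtime-changeable Spark Session config (the same one used in the ACON at the top of this page) would look like the sketch below.

\n\n
\n
# Minimal sketch: only Spark Session configs that can be changed at runtime belong here.
exec_env = {
    "spark.databricks.delta.schema.autoMerge.enabled": True,
}
\n
\n\n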
\n"}, {"fullname": "lakehouse_engine_usage.data_loader.append_load_from_jdbc_with_permissive_mode", "modulename": "lakehouse_engine_usage.data_loader.append_load_from_jdbc_with_permissive_mode", "kind": "module", "doc": "

Append Load from JDBC with PERMISSIVE mode (default)

\n\n

This scenario is an append load from a JDBC source (e.g., SAP BW, Oracle Database, SQL Server Database...).

\n\n
\n
from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "jdbc",\n      "jdbc_args": {\n        "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/append_load/jdbc_permissive/tests.db",\n        "table": "jdbc_permissive",\n        "properties": {\n          "driver": "org.sqlite.JDBC"\n        }\n      },\n      "options": {\n        "numPartitions": 1\n      }\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "db_table": "test_db.jdbc_permissive_table"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_date",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "date"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "appended_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "date",\n            "increment_df": "max_sales_bronze_date"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "appended_sales",\n      "write_type": "append",\n      "db_table": "test_db.jdbc_permissive_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/jdbc_permissive/data"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
\n
\n\n
Relevant notes
\n\n
    \n
  • The ReadMode is PERMISSIVE in this scenario, which is the default in Spark, hence we don't need to specify it. Permissive means don't enforce any schema on the input data.
  • \n
  • From a JDBC source, the ReadType always needs to be \"batch\", as \"streaming\" is not available for a JDBC source.
  • \n
  • In this scenario we do an append load by getting the max date on bronze (transformer_spec \"get_max_value\") and using that date to filter the source, so that we only get data with a date greater than that max date on bronze (transformer_spec \"incremental_filter\"). That is the standard way we do incremental batch loads in the lakehouse engine. For streaming incremental loads we rely on the Spark Streaming checkpoint feature (check a streaming append load ACON example).
  • \n
\n"}, {"fullname": "lakehouse_engine_usage.data_loader.append_load_with_failfast", "modulename": "lakehouse_engine_usage.data_loader.append_load_with_failfast", "kind": "module", "doc": "

Append Load with FAILFAST

\n\n

This scenario is an append load enforcing the schema (using the schema of the target table to enforce the schema of the source, i.e., the schema of the source needs to exactly match the schema of the target table) and failing fast (FAILFAST) if the schema of the input data does not match the one we specified.

\n\n
\n
from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "enforce_schema_from_table": "test_db.failfast_table",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "mode": "FAILFAST"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/append_load/failfast/data"\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "db_table": "test_db.failfast_table"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_date",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "date"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "appended_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "date",\n            "increment_df": "max_sales_bronze_date"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "appended_sales",\n      "write_type": "append",\n      "db_table": "test_db.failfast_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/failfast/data"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
\n
\n\n
Relevant notes
\n\n
    \n
  • The ReadMode is FAILFAST in this scenario, i.e., the algorithm fails if the schema of the input data does not match the one specified via the schema_path, read_schema_from_table or schema input_specs variables.
  • \n
  • In this scenario we do an append load by getting the max date on bronze (transformer_spec \"get_max_value\") and using that date to filter the source, so that we only get data with a date greater than that max date on bronze (transformer_spec \"incremental_filter\"). That is the standard way we do incremental batch loads in the lakehouse engine. For streaming incremental loads we rely on the Spark Streaming checkpoint feature (check a streaming append load ACON example).
  • \n
\n"}, {"fullname": "lakehouse_engine_usage.data_loader.batch_delta_load_init_delta_backfill_with_merge", "modulename": "lakehouse_engine_usage.data_loader.batch_delta_load_init_delta_backfill_with_merge", "kind": "module", "doc": "

Batch Delta Load Init, Delta and Backfill with Merge

\n\n

This scenario illustrates the process of implementing a delta load algorithm by first using an ACON to perform an initial load, then another one to perform the regular deltas that will be triggered on a recurrent basis, and finally an ACON for backfilling specific parcels if ever needed.

\n\n

Init Load

\n\n
\n
from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "inferSchema": True\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "condensed_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "condensed_sales",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date"\n      }\n    }\n  ]\n}\n\nload_data(acon=acon)\n
\n
\n\n
Relevant Notes
\n\n
    \n
  • We can see that even though this is an init load we still have chosen to condense the records through our \"condense_record_mode_cdc\" transformer. This is a condensation step capable of handling SAP BW style changelogs based on actrequest_timestamps, datapakid, record_mode, etc...
  • \n
  • In the init load we actually did a merge in this case because we wanted to test locally if a merge with an empty target table works, but you don't have to do it, as an init load can usually be just a full load. Whether a merge of init data into an empty table has any performance implications compared to a regular insert remains to be tested, but we don't have any reason to recommend a merge over an insert for an init load. As said, this was done solely for local testing purposes; you can just use write_type: \"overwrite\".
  • \n
\n\n

Delta Load

\n\n
\n
from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "inferSchema": True\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_timestamp",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "actrequest_timestamp"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "condensed_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "actrequest_timestamp",\n            "increment_df": "max_sales_bronze_timestamp"\n          }\n        },\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "condensed_sales",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n        "delete_predicate": "new.recordmode in ('R','D','X')",\n        "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"\n      }\n    }\n  ]\n}\n\nload_data(acon=acon)\n
\n
\n\n
Relevant Notes
\n\n
    \n
  • The merge predicate and the insert, delete or update predicates should reflect the reality of your data, and it's up to each data product to figure out which predicates better match their reality:

    \n\n
      \n
    • The merge predicate usually involves making sure that the \"primary key\" for your data matches.\n
      \n
      Performance Tip!!! Ideally, in order to get a performance boost in your merges, you should also place a filter in your merge predicate (e.g., a certain technical or business date in the target table >= x days ago), based on the assumption that rows outside that interval will never change in the future. This can drastically decrease the merge times of big tables. See the sketch after these notes.
    • \n
    \n\n
\n\n
    \n\n

  • The insert, delete and update predicates will always depend on the structure of your changelog, and also on how you expect your updates to arrive (e.g., in certain data products you know that you will never get out-of-order data or late arriving data, while in others you can never ensure that). These predicates should reflect that, in order to prevent you from making unwanted changes to the target delta lake table.

    \n\n
      \n
    • For example, in this scenario, we delete rows that have the R, D or X record_mode values, because we know that if, after condensing the rows, that is the latest status of a row in the changelog, it should be deleted, and we never insert rows with those statuses (note: we use this guardrail on the insert to prevent out-of-order changes, which is likely not an issue in SAP BW).
    • \n
    • Because the insert_predicate is fully optional, in your scenario you may not require that.
    • \n
  • \n\n

  • In this scenario, we don't pass an update_predicate in the ACON, because both insert_predicate and update_predicate are fully optional, i.e., if you don't pass them the algorithm will update any data that matches the merge_predicate and insert any data that does not match it. The predicates in these cases just make sure the algorithm does not insert or update any data that you don't want. For instance, in a late arriving changes scenario a deleted row may arrive from the changelog before the corresponding update row; to prevent your target table from holding inconsistent data for a certain period of time (it will eventually become consistent once you receive the latest correct status from the changelog), you can have this guardrail in the insert or update predicates. Again, for most sources this will not happen, but sources like Kafka, for example, cannot 100% ensure order.
  • \n
  • In order to understand how we can cover different scenarios (e.g., late arriving changes, out of order changes, etc.), please go here.
  • \n

\n
  • The order of the predicates in the ACON does not matter; it is the logic in the lakehouse engine DeltaMergeWriter's \"_merge\" function that matters.
  • \n
  • Notice the \"<=>\" operator? In Spark SQL that's the null-safe equality operator.
  • \n\n\n
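To illustrate the performance tip above, here is a hedged sketch of merge options where the merge predicate also filters the target table to recent rows (the 60-day window and the column names are illustrative and assume that older rows never receive changes).

\n\n
\n
# Hedged sketch (illustrative columns and time window): restrict the merge to recent
# target rows, assuming rows older than 60 days will never change.
merge_opts = {
    "merge_predicate": (
        "current.date >= date_sub(current_date(), 60) "
        "and current.salesorder = new.salesorder "
        "and current.item = new.item "
        "and current.date <=> new.date"
    )
}
\n
\n\n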

    Backfilling

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "inferSchema": True\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_timestamp",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "actrequest_timestamp"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "condensed_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "actrequest_timestamp",\n            "increment_value": "20180110120052t",\n            "greater_or_equal": True\n          }\n        },\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "condensed_sales",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n        "delete_predicate": "new.recordmode in ('R','D','X')",\n        "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"\n      }\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Relevant Notes
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.custom_transformer", "modulename": "lakehouse_engine_usage.data_loader.custom_transformer", "kind": "module", "doc": "

    Custom Transformer

    \n\n

    There may be scenarios where the data product dev team faces the need to perform complex data transformations that are either not yet available in the lakehouse engine or whose logic is just too complex to chain in an ACON file. In the context of the lakehouse, the only layers that usually impose that complexity are silver+ and gold. This page targets exactly those cases.

    \n\n

    Below you'll find a notebook where you can pass your own PySpark or Spark SQL logic into the ACON, by dynamically injecting a python function into the ACON dictionary. The lakehouse engine will take care of executing those transformations in the transformation step of the data loader algorithm. Please read the notebook's comments carefully to understand how it works, or simply open it in your notebook environment, which will make the notebook's code and comments more readable.

    \n\n
    \n\n
    Force Streaming Micro Batch Processing.
    \n\n

    When you use streaming mode with a custom transformer, it's\nhighly advisable that you set the force_streaming_foreach_batch_processing flag to True in the transform specification, as\nexplained above!

    \n\n
    \n\n

    What is a custom transformer in the Lakehouse Engine and how you can use it to write your own pyspark logic?

    \n\n

    We highly promote the Lakehouse Engine for creating Data Products aligned with the data source (bronze/silver layer), pumping data into silver so our Data Scientists and Analysts can leverage the value of the data in silver, as close as possible to how it comes from the source.\nThe low-code and configuration-driven nature of the lakehouse engine makes it a compelling framework to use in such cases, where the transformations that are done from bronze to silver are not that many, as we want to keep the data close to the source.

    \n\n

    However, when it comes to Data Products enriched in some way or built for insights (silver+, gold), they are typically heavy\non transformations (they are the T of the overall ELT process), so the nature of the lakehouse engine might get in\nthe way of adequately building them. Considering this, and considering our user base that prefers an ACON-based\napproach and all the nice off-the-shelf features of the lakehouse engine, we have developed a feature that\nallows you to write custom transformers containing your entire pyspark logic and pass them as an argument\nin the ACON (the configuration file that configures every lakehouse engine algorithm).

    \n\n

    Motivation:

    \n\n

    Doing that, you let the ACON guide your read, data quality, write and terminate processes, and you just focus on transforming data :)

    \n\n

    Custom transformation Function

    \n\n

    The function below is the one that encapsulates all your defined pyspark logic and sends it as a python function to the lakehouse engine. This function will then be invoked internally in the lakehouse engine via a df.transform() function. If you are interested in checking the internals of the lakehouse engine, our codebase is openly available here: https://github.com/adidas/lakehouse-engine

    \n\n
    \n\n
    Attention!!!
    \n\n

    For this process to work, your function defined below needs to receive a DataFrame and return a DataFrame. Attempting any other method signature (e.g., defining more parameters) will not work, unless you use something like python partials, for example (see the sketch after the example below).

    \n\n
    \n\n
    \n
    def get_new_data(df: DataFrame) -> DataFrame:\n    """Get the new data from the lakehouse engine reader and prepare it."""\n    return (\n        df.withColumn("amount", when(col("_change_type") == "delete", lit(0)).otherwise(col("amount")))\n        .select("article_id", "order_date", "amount")\n        .groupBy("article_id", "order_date")\n        .agg(sum("amount").alias("amount"))\n    )\n\n\ndef get_joined_data(new_data_df: DataFrame, current_data_df: DataFrame) -> DataFrame:\n    """Join the new data with the current data already existing in the target dataset."""\n    return (\n        new_data_df.alias("new_data")\n        .join(\n            current_data_df.alias("current_data"),\n            [\n                new_data_df.article_id == current_data_df.article_id,\n                new_data_df.order_date == current_data_df.order_date,\n            ],\n            "left_outer",\n        )\n        .withColumn(\n            "current_amount", when(col("current_data.amount").isNull(), lit(0)).otherwise("current_data.amount")\n        )\n        .withColumn("final_amount", col("current_amount") + col("new_data.amount"))\n        .select(col("new_data.article_id"), col("new_data.order_date"), col("final_amount").alias("amount"))\n    )\n\n\ndef calculate_kpi(df: DataFrame) -> DataFrame:\n    """Calculate KPI through a custom transformer that will be provided in the ACON.\n\n    Args:\n        df: DataFrame passed as input.\n\n    Returns:\n        DataFrame: the transformed DataFrame.\n    """\n    new_data_df = get_new_data(df)\n\n    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because is the internal object the\n    # lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine.\n    current_data_df = ExecEnv.SESSION.table(\n        "my_database.my_table"\n    )\n\n    transformed_df = get_joined_data(new_data_df, current_data_df)\n\n    return transformed_df\n
    \n
    \n\n
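    As mentioned above, if your transformer needs extra parameters beyond the DataFrame, one option is to bind them upfront with python partials, so that the lakehouse engine still receives a plain DataFrame -> DataFrame function. Below is a hedged sketch (the function and its threshold parameter are purely illustrative).
    \n\n
    \n
    from functools import partial

    from pyspark.sql import DataFrame


    def calculate_kpi_with_threshold(df: DataFrame, threshold: int) -> DataFrame:
        """Illustrative transformer that needs an extra parameter besides the DataFrame."""
        return df.filter(f"amount >= {threshold}")


    # Bind the extra argument now; the resulting callable keeps the DataFrame -> DataFrame
    # signature expected by the custom_transformation transformer.
    calculate_kpi = partial(calculate_kpi_with_threshold, threshold=100)
    \n
    \n\n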

    Don't like pyspark API? Write SQL

    \n\n

    You don't have to comply with the pyspark API if you prefer SQL. Inside the function above (or any of\nthe auxiliary functions you decide to develop) you can write something like:

    \n\n
    \n
    def calculate_kpi(df: DataFrame) -> DataFrame:\n    df.createOrReplaceTempView("new_data")\n\n    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because is the internal object the\n    # lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine.\n    ExecEnv.SESSION.sql(\n        """\n          CREATE OR REPLACE TEMP VIEW my_kpi AS\n          SELECT ... FROM new_data ...\n        """\n    )\n\n    return ExecEnv.SESSION.table("my_kpi")\n
    \n
    \n\n

    Just your regular ACON

    \n\n

    If you look at the ACON below, everything is the same as you would do in a Data Product, but the transform_specs section of the ACON has one difference: a function called \"custom_transformation\", to which we supply as an argument the function defined above with the pyspark code.

    \n\n
    \n\n
    Attention!!!
    \n\n

    Do not pass the function as calculate_kpi(), but as calculate_kpi, otherwise you are telling python to invoke the function right away, as opposed to passing it as an argument to be invoked later by the lakehouse engine.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "sales",\n            "read_type": "streaming",\n            "data_format": "delta",\n            "db_table": "my_database.dummy_sales",\n            "options": {"readChangeFeed": "true"},\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "transformed_sales_kpi",\n            "input_id": "sales",\n            # because we are using streaming, this allows us to make sure that\n            # all the computation in our custom transformer gets pushed to\n            # Spark's foreachBatch method in a stream, which allows us to\n            # run all Spark functions in a micro batch DataFrame, as there\n            # are some Spark functions that are not supported in streaming.\n            "force_streaming_foreach_batch_processing": True,\n            "transformers": [\n                {\n                    "function": "custom_transformation",\n                    "args": {"custom_transformer": calculate_kpi},\n                },\n            ],\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "my_table_quality",\n            "input_id": "transformed_sales_kpi",\n            "dq_type": "validator",\n            "bucket": "my_dq_bucket",\n            "data_docs_bucket": "my_data_product_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "expectations_store_prefix": "dq/expectations/",\n            "validations_store_prefix": "dq/validations/",\n            "checkpoint_store_prefix": "dq/checkpoints/",\n            "tbl_to_derive_pk": "my_table",\n            "dq_functions": [\n                {"function": "expect_column_values_to_not_be_null", "args": {"column": "article_id"}},\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "sales_kpi",\n            "input_id": "transformed_sales_kpi",\n            "write_type": "merge",\n            "data_format": "delta",\n            "db_table": "my_database.my_table",\n            "options": {\n                "checkpointLocation": "s3://my_data_product_bucket/gold/my_table",\n            },\n            "merge_opts": {\n                "merge_predicate": "new.article_id = current.article_id AND new.order_date = current.order_date"\n            },\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_from_sap_b4_adso", "modulename": "lakehouse_engine_usage.data_loader.extract_from_sap_b4_adso", "kind": "module", "doc": "

    Extract from SAP B4 ADSOs

    \n\n

    A custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so that consumption of data from\nSAP B4 ADSOs can be easily set up. The framework abstracts all the logic behind the init/delta extractions\n(AQ vs CL, active table, changelog table, requests status table, how to identify the next delta timestamp...),\nonly requiring a few parameters that are explained and exemplified in the\ntemplate scenarios that we have created.

    \n\n
    \n\n
    This custom reader is very similar to and uses most features from the sap_bw reader, so if you were using specific filters/parameters with the sap_bw reader, there is a high chance you can keep using them in a very similar way with the sap_b4 reader. The main concepts apply to both readers, such as the strategies on how to parallelize the extractions, for example.
    \n\n
    \n\n

    How can I find a good candidate column for partitioning the extraction from S4Hana?

    \n\n
    \n\n
    Parallelization Limitations
    \n\n

    There are no limits imposed by the Lakehouse-Engine framework, but you need to consider that there might be limits imposed by the source.

    \n\n

    E.g., each user might be restricted to using about 100GB of memory at a time on the source.

    \n\n

    Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful when choosing the number of partitions: Spark is a distributed system and can open many connections to the source.

    \n\n
    \n\n
    \n\n
    In case you want to perform further filtering on the REQTSN field, please be aware that the filter is not pushed down to SAP B4 by default (meaning it will have bad performance).
    \n\n

    In that case, you will need to use the customSchema option while reading, so that filter pushdown can be enabled for that field.

    \n\n
    \n\n

    You can check the code documentation of the reader below:

    \n\n

    SAP B4 Reader

    \n\n

    JDBC Extractions arguments

    \n\n

    SAP B4 Extractions arguments

    \n\n
    \n\n
    For extractions using the SAP B4 reader, you can use the arguments listed in the SAP B4 arguments, but also the ones listed in the JDBC extractions, as those are inherited as well.
    \n\n
    \n\n

    Extraction from SAP B4 ADSOs Template

    \n\n

    This template covers the following scenarios of extractions from the SAP B4Hana ADSOs:

    \n\n\n\n
    \n\n

    Note: the template will cover two ADSO Types:

    \n\n
      \n
    • AQ: ADSO which is of append type and for which a single ADSO/table holds all the information, like an\nevent table. For this type, the same ADSO is used for reading data both for the inits and deltas. Usually, these\nADSOs end with the digit \"6\".
    • \n
    • CL: ADSO which is split into two ADSOs, one holding the change log events, the other having the active\ndata (current version of the truth for a particular source). For this type, the ADSO having the active data\nis used for the first extraction (init) and the change log ADSO is used for the subsequent extractions (deltas).\nUsually, these ADSOs are split into active table ending with the digit \"2\" and changelog table ending with digit \"3\".
    • \n
    \n\n
    \n\n

    For each of these ADSO types, the lakehouse-engine abstracts the logic to get the delta extractions. This logic\nbasically consists of joining the db_table (for AQ) or the changelog_table (for CL) with the table\nhaving the requests status (my_database.requests_status_table).\nOne of the fields used for this joining is the data_target, which has a relationship with the ADSO\n(db_table/changelog_table), being basically the same identifier without considering parts of it.

    \n\n

    Based on the previous insights, the queries that the lakehouse-engine generates under the hood translate to\n(this is a simplified version, for more details please refer to the lakehouse-engine code documentation):\nAQ Init Extraction:\nSELECT t.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table t

    \n\n

    AQ Delta Extraction:\nSELECT tbl.*, CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table AS tbl\nJOIN my_database.requests_status_table AS req\nWHERE STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')\nAND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('my_identifier')\nAND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table

    \n\n

    CL Init Extraction:\nSELECT t.*,\n {self._SAP_B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn,\n '0' AS datapakid,\n 0 AS record,\n CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table_2 t

    \n\n

    CL Delta Extraction:\nSELECT tbl.*,\nCAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table_3 AS tbl\nJOIN my_database.requests_status_table AS req\nWHERE STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')\nAND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('my_data_target')\nAND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table

    \n\n\n\n

    1 - The Simplest Scenario (Not parallel - Not Recommended)

    \n\n

    This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques\nand using a single connection to retrieve all the data from the source. It should only be used in case the ADSO\nyou want to extract from SAP B4Hana is a small one, with no big performance requirements to fulfill.\nWhen extracting from the source ADSO, there are two options:

    \n\n\n\n

    The example below is composed of two cells.

    \n\n\n\n
    \n\n

    There may be cases where you might want to always extract fully from the source ADSO. In these cases,\nyou only need to use a Delta Init every time, meaning you would use \"extraction_type\": \"init\" and\n\"write_type\": \"overwrite\", as shown below. The explanation of what a Delta Init/Delta is\napplies to all the scenarios presented in this notebook.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "adso_type": "AQ",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2 - Parallel extraction

    \n\n

    In this section, 5 possible scenarios for parallel extractions from SAP B4Hana ADSOs are presented.

    \n\n

    2.1 - Parallel Extraction, Simplest Scenario

    \n\n

    This scenario provides the simplest example you can have for a parallel extraction from SAP B4Hana, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction into several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.

    \n\n

    In the example below, \"numPartitions\": 10 is specified, meaning that Spark will open 10 parallel connections\nto the source ADSO and automatically decide how to parallelize the extraction based on that requirement. This is the\nonly change compared to the example provided in scenario 1.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_sap_b4_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_par_simple/",\n                "adso_type": "AQ",\n                "numPartitions": 10,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_par_simple/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.2 - Parallel Extraction, Provide upper_bound (Recommended)

    \n\n

    This scenario performs the extraction from the SAP B4 ADSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed,\nusing the following options:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as\nthe partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and\nthree additional options are provided to fine tune the extraction (partitionColumn, lowerBound,\nupperBound).

    \n\n

    When these 4 properties are used, Spark will use them to build several queries to split the extraction.

    \n\n

    Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:

    \n\n\n\n
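
    As an illustration (not the engine's actual code), the sketch below mimics how Spark's JDBC reader derives the per-partition filters from these four properties; the exact stride and boundary handling may differ slightly between Spark versions.

    \n\n
    \n
    # Illustrative sketch only: approximately how Spark splits [lowerBound, upperBound] into numPartitions filters.\ndef partition_filters(column: str, lower_bound: int, upper_bound: int, num_partitions: int) -> list:\n    stride = upper_bound // num_partitions - lower_bound // num_partitions\n    filters = []\n    current = lower_bound\n    for i in range(num_partitions):\n        lower = f"{column} >= {current}" if i > 0 else None\n        current += stride\n        upper = f"{column} < {current}" if i < num_partitions - 1 else None\n        if lower and upper:\n            filters.append(f"{lower} AND {upper}")\n        elif lower:\n            filters.append(lower)\n        else:\n            filters.append(f"{upper} OR {column} IS NULL")\n    return filters\n\n# partition_filters("RECORD", 1, 100, 10)[0]  -> 'RECORD < 11 OR RECORD IS NULL'\n# partition_filters("RECORD", 1, 100, 10)[-1] -> 'RECORD >= 91'\n
    \n
    \n\n

    The full acon example for this scenario is shown below.

    \n\n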
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_par_prov_upper/",\n                "adso_type": "AQ",\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n                "upperBound": 1000000,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_par_prov_upper/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.3 - Parallel Extraction, Automatic upper_bound (Recommended)

    \n\n

    This scenario is very similar to 2.2, the only difference being that upperBound is not provided. Instead, the property calculate_upper_bound set to true is used, to benefit from the automatic calculation of the upperBound (derived from the partitionColumn) offered by the lakehouse-engine framework. This is useful because, in most cases, you will probably not know the max value of the column. The only thing to consider is that, if you use this automatic calculation of the upperBound, an initial query will be issued to the SAP B4 ADSO to retrieve the max value of the partitionColumn, before the actual extraction query.

    \n\n
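
    For illustration only, that extra query is conceptually equivalent to the sketch below: a pushdown MAX over the partitionColumn (the actual statement built by the framework may differ). It assumes a running SparkSession named spark and the same placeholder connection details used in this template.

    \n\n
    \n
    # Conceptual sketch (not the engine's code): derive the upperBound with a pushdown MAX query.\nmax_query = "(SELECT MAX(RECORD) AS upper_bound FROM my_database.my_table) max_record"\n\nupper_bound_df = (\n    spark.read.format("jdbc")\n    .option("url", "my_sap_b4_url")\n    .option("user", "my_user")\n    .option("password", "my_b4_hana_pwd")\n    .option("driver", "com.sap.db.jdbc.Driver")\n    .option("dbtable", max_query)\n    .load()\n)\n\nupper_bound = upper_bound_df.first()[0]  # value the engine would then use as the "upperBound" option\n
    \n
    \n\n

    The full acon example using calculate_upper_bound follows.

    \n\n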
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "calculate_upper_bound": True,\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_par_calc_upper/",\n                "adso_type": "AQ",\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_par_calc_upper/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.4 - Parallel Extraction, Provide Predicates (Recommended)

    \n\n

    This scenario performs the extraction from the SAP B4 ADSO in parallel and is useful in contexts in which there is no numeric, date or timestamp column to parallelize the extraction (e.g. when extracting from an ADSO of type CL, the active table does not have the RECORD column, which is usually a good option for scenarios 2.2 and 2.3):

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn, especially if that column does not comply with scenario 2.2 or 2.3.

    \n\n

    When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.

    \n\n

    Below, the lakehouse-engine function to automatically generate the predicates list is presented.

    \n\n

    This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.

    \n\n

    predicates_query: In the sample below, the whole table is considered (select distinct(x) from table), but it is possible to filter the predicates list here, especially if you are applying a filter in the transformations spec and you know the entire table won't be necessary. In that case, you can change it to something like: select distinct(x) from table where x > y.

    \n\n

    predicates_add_null: You can decide whether to include null in the predicates list or not; by default, this property is True.

    \n\n

    Example: for \"partition_column\": \"CALMONTH\"

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\n# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers\n# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\n\nExecEnv.get_or_create()\n\npartition_column = "CALMONTH"\ndbtable = "my_database.my_table_3"\n\npredicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""\nuser = "my_user"\npassword = "my_b4_hana_pwd"\nurl = "my_sap_b4_url"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        predicates_add_null=predicates_add_null,\n        partition_column=partition_column,\n        dbtable=dbtable,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_2_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "driver": "com.sap.db.jdbc.Driver",\n                "dbtable": "my_database.my_table_2",\n                "changelog_table": "my_database.my_table_3",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_2_prov_predicates/",\n                "adso_type": "CL",\n                "predicates": predicates,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_2_bronze",\n            "input_id": "my_identifier_2_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_2_prov_predicates/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
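
    For reference, the predicates variable computed above is simply a list of SQL filter strings, and Spark builds one extraction query per list element. With predicates_add_null set to True, it would look roughly like the following (illustrative values; the exact string formatting is up to the framework).

    \n\n
    \n
    # Illustrative only: what the predicates list could look like for "partition_column": "CALMONTH".\npredicates = [\n    "CALMONTH='202301'",\n    "CALMONTH='202302'",\n    "CALMONTH='202303'",\n    "CALMONTH IS NULL",  # included because predicates_add_null is True\n]\n
    \n
    \n\n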

    2.5 - Parallel Extraction, Generate Predicates

    \n\n

    This scenario is very similar to the scenario 2.4, with the only difference that it automatically\ngenerates the predicates (\"generate_predicates\": True).

    \n\n

    This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).

    \n\n

    When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is to check, for the init/delta portion of the data, what the distinct values of the partitionColumn serving that data are. These values are then used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the partitionColumn will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you use a partitionColumn LOAD_DATE and, for today's delta, all the data (let's suppose 2 million rows) is served by a single LOAD_DATE = 20200101, Spark would use a single partition to extract everything. In this extreme case, you would probably need to change your partitionColumn. Note: these extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.

    \n\n

    Example: for \"partitionColumn\": \"record\"\nGenerate predicates:

    \n\n\n\n

    Spark will generate 100 queries like this:

    \n\n\n\n

    Generate predicates will also consider null by default:

    \n\n\n\n
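
    To make this more concrete, the sketch below (illustrative only, with hypothetical values) shows how one predicate per distinct value of the partitionColumn is derived, including the null predicate added by default.

    \n\n
    \n
    # Illustrative sketch with hypothetical values: one predicate per distinct value of the partitionColumn.\ndistinct_values = ["1", "2", None]  # e.g. the distinct values of "record" serving the init/delta data\n\npredicates = [\n    f"record='{value}'" if value is not None else "record IS NULL"\n    for value in distinct_values\n]\n\n# Spark then runs one extraction query per predicate, e.g.:\n#   SELECT ... FROM <changelog_table> WHERE record='1'\n
    \n
    \n\n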

    To disable this behaviour the following variable value should be changed to false: \"predicates_add_null\": False

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_2_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "generate_predicates": True,\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "driver": "com.sap.db.jdbc.Driver",\n                "dbtable": "my_database.my_table_2",\n                "changelog_table": "my_database.my_table_3",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_2_gen_predicates/",\n                "adso_type": "CL",\n                "partitionColumn": "CALMONTH",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_2_bronze",\n            "input_id": "my_identifier_2_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_2_gen_predicates/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_from_sap_bw_dso", "modulename": "lakehouse_engine_usage.data_loader.extract_from_sap_bw_dso", "kind": "module", "doc": "

    Extract from SAP BW DSOs

    \n\n
    \n\n
    Parallelization Limitations
    \n\n

    Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.

    \n\n
    \n\n

    A custom sap_bw reader and a few utils are offered in the lakehouse-engine framework so that the consumption of data from SAP BW DSOs can be easily set up. The framework abstracts all the logic behind the init/delta extractions (active table, changelog table, activation requests table, how to identify the next delta timestamp...), only requiring a few parameters that are explained and exemplified in the template scenarios that we have created.

    \n\n

    This page also provides you a section to help you figure out a good candidate for partitioning the extraction from SAP BW.

    \n\n

    You can check the code documentation of the reader below:

    \n\n

    SAP BW Reader

    \n\n

    JDBC Extractions arguments

    \n\n

    SAP BW Extractions arguments

    \n\n
    \n\n

    For extractions using the SAP BW reader, you can use the arguments listed in the SAP BW arguments, but also \nthe ones listed in the JDBC extractions, as those are inherited as well.

    \n\n
    \n\n

    Extraction from SAP-BW template

    \n\n

    This template covers the following scenarios of extractions from the SAP BW DSOs:

    \n\n\n\n\n\n

    1 - The Simplest Scenario (Not parallel - Not Recommended)

    \n\n

    This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used if the DSO you want to extract from SAP BW is small, with no big performance requirements to fulfil. When extracting from the source DSO, there are two options:

    \n\n\n\n

    The example below is composed of two cells.

    \n\n\n\n
    \n\n

    There may be cases where you want to always extract fully from the source DSO. In these cases, you only need to use a Delta Init every time, meaning you would use \"extraction_type\": \"init\" and \"write_type\": \"overwrite\", as shown below. The explanation of what a Delta Init/Delta is applies to all the scenarios presented in this notebook.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            # You should use this custom reader to benefit from the lakehouse-engine utils for extractions from SAP BW\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2 - Parallel extraction

    \n\n

    In this section, 6 possible scenarios for parallel extractions from SAP BW DSOs are presented.

    \n\n

    2.1 - Parallel Extraction, Simplest Scenario

    \n\n

    This scenario provides the simplest example you can have for a parallel extraction from SAP BW, only using the property numPartitions. The goal of the scenario is to cover the case in which people do not have much knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction in several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential. In the example below, \"numPartitions\": 10 is specified, meaning that Spark will open 10 parallel connections to the source DSO and automatically decide how to parallelize the extraction based on that requirement. This is the only change compared to the example provided in scenario 1.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 10,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.2 - Parallel Extraction, Provide upper_bound (Recommended)

    \n\n

    This scenario performs the extraction from the SAP BW DSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using\nthe following options:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as\nthe partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and\nthree additional options are provided to fine tune the extraction (partitionColumn, lowerBound,\nupperBound).

    \n\n

    When these 4 properties are used, Spark will use them to build several queries to split the extraction.

    \n\n

    Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:

    \n\n\n\n
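
    Illustratively, for those example values the per-partition filters would be roughly the following (the exact boundaries depend on Spark's stride calculation); the full acon example for this scenario follows.

    \n\n
    \n
    # Roughly the filters Spark appends to each of the 10 extraction queries (illustrative boundaries).\npartition_filters = [\n    "record < 11 OR record IS NULL",\n    "record >= 11 AND record < 21",\n    # ... one filter per partition ...\n    "record >= 91",\n]\n
    \n
    \n\n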
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 3,\n                "partitionColumn": "my_partition_col",\n                "lowerBound": 1,\n                "upperBound": 42,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.3 - Parallel Extraction, Automatic upper_bound (Recommended)

    \n\n

    This scenario is very similar to 2.2, the only difference being that upperBound is not provided. Instead, the property calculate_upper_bound set to true is used, to benefit from the automatic calculation of the upperBound (derived from the partitionColumn) offered by the lakehouse-engine framework. This is useful because, in most cases, you will probably not know the max value of the column. The only thing to consider is that, if you use this automatic calculation of the upperBound, an initial query will be issued to the SAP BW DSO to retrieve the max value of the partitionColumn, before the actual extraction query.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "calculate_upper_bound": True,\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 10,\n                "partitionColumn": "my_partition_col",\n                "lowerBound": 1,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.4 - Parallel Extraction, Backfilling

    \n\n

    This scenario covers the case in which you might want to backfill the data extracted from a SAP BW DSO and made available in the bronze layer. By default, the delta extraction considers the max value of the column actrequest_timestamp in the data already extracted. However, there might be cases in which you want to extract a delta from a particular timestamp onwards or for a particular interval of time. For this, you can use the properties min_timestamp and max_timestamp.

    \n\n

    Below, a very similar example to the previous one is provided, the only differences being that the properties \"min_timestamp\": \"20210910000000\" and \"max_timestamp\": \"20210913235959\" are provided, meaning it will extract the data from the changelog table using the filter actrequest_timestamp > \"20210910000000\" and actrequest_timestamp <= \"20210913235959\", regardless of whether some of that data is already available in the destination. Moreover, note that the property latest_timestamp_data_location does not need to be provided, as the timestamps to be considered are directly provided (if both the timestamps and latest_timestamp_data_location are provided, the latter will have no effect). Additionally, \"extraction_type\": \"delta\" and \"write_type\": \"append\" are forced, instead of using the variables as in the other examples, because the backfilling scenario only makes sense for delta extractions.

    \n\n
    \n\n

    Note: be aware that the backfilling example shown here has no mechanism to prevent you from generating duplicated data in bronze. For your scenarios, you can either use this example and solve any duplication in the silver layer, or extract the delta with a merge strategy while writing to bronze, instead of appending (see the sketch after the example below).

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "calculate_upper_bound": True,\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "extraction_type": "delta",\n                "numPartitions": 10,\n                "partitionColumn": "my_partition_col",\n                "lowerBound": 1,\n                "min_timestamp": "20210910000000",\n                "max_timestamp": "20210913235959",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
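
    If you prefer the merge strategy mentioned in the note above (instead of appending and deduplicating later in silver), a minimal sketch of such an output spec could look like the following. The merge_opts/merge_predicate option names and the key column are assumptions to validate against the lakehouse-engine documentation for your version.

    \n\n
    \n
    # Hedged sketch (assumed option names): an output spec writing the backfilled delta with a merge instead of an append.\nmerge_output_spec = {\n    "spec_id": "my_identifier_bronze",\n    "input_id": "my_identifier_source",\n    "write_type": "merge",  # assumption: merge write type supported by the engine\n    "data_format": "delta",\n    "location": "s3://my_path/my_identifier/",\n    "merge_opts": {  # assumption: option names to validate against the engine docs\n        "merge_predicate": "current.my_key = new.my_key",  # hypothetical business key\n    },\n}\n\n# This dict would replace the entry in "output_specs" of the acon above.\n
    \n
    \n\n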

    2.5 - Parallel Extraction, Provide Predicates (Recommended)

    \n\n

    This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the partitionColumn, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).

    \n\n

    When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.

    \n\n

    Below, the lakehouse-engine function to automatically generate the predicates list is presented.

    \n\n

    This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.

    \n\n

    predicates_query: In the sample below, the whole table is considered (select distinct(x) from table), but it is possible to filter the predicates list here, especially if you are applying a filter in the transformations spec and you know the entire table won't be necessary. In that case, you can change it to something like: select distinct(x) from table where x > y.

    \n\n

    predicates_add_null: You can decide whether to include null in the predicates list or not; by default, this property is True.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\n# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers\n# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\n\nExecEnv.get_or_create()\n\npartition_column = "my_partition_column"\ndbtable = "my_database.my_table"\n\npredicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""\ncolumn_for_predicates = partition_column\nuser = "my_user"\npassword = "my_hana_pwd"\nurl = "my_bw_url"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        dbtable=dbtable,\n        partition_column=partition_column,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "predicates": predicates,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.6 - Parallel Extraction, Generate Predicates (Recommended)

    \n\n

    This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the partitionColumn, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).

    \n\n

    When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is to check, for the init/delta portion of the data, what the distinct values of the partitionColumn serving that data are. These values are then used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the partitionColumn will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you use a partitionColumn LOAD_DATE and, for today's delta, all the data (let's suppose 2 million rows) is served by a single LOAD_DATE = 20200101, Spark would use a single partition to extract everything. In this extreme case, you would probably need to change your partitionColumn. Note: these extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.

    \n\n

    Example: for \"partitionColumn\": \"record\"\nGenerate predicates:

    \n\n\n\n

    Spark will generate 100 queries like this:

    \n\n\n\n

    Generate predicates will also consider null by default:

    \n\n\n\n

    To disable this behaviour the following variable value should be changed to false: \"predicates_add_null\": False

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "generate_predicates": True,\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "partitionColumn": "my_partition_col",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    3 - Extraction from Write Optimized DSOs

    \n\n

    This scenario is based on the best practices of the scenario 2.2, but it is ready to extract data from\nWrite Optimized DSOs, which have the changelog embedded in the active table, instead of having a separate\nchangelog table. Due to this reason, you need to specify that the changelog_table parameter value is equal\nto the dbtable parameter value.\nMoreover, these tables usually already include the changelog technical columns\nlike RECORD and DATAPAKID, for example, that the framework adds by default. Thus, you need to specify\n\"include_changelog_tech_cols\": False to change this behaviour.\nFinally, you also need to specify the name of the column in the table that can be used to join with the\nactivation requests table to get the timestamp of the several requests/deltas,\nwhich is \"actrequest\" by default (\"request_col_name\": 'request').

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "changelog_table": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "request_col_name": "request",\n                "include_changelog_tech_cols": False,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 2,\n                "partitionColumn": "RECORD",\n                "lowerBound": 1,\n                "upperBound": 50000,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    3.1 - Extraction from Write Optimized DSOs, Get ACTREQUEST_TIMESTAMP from Activation Requests Table

    \n\n

    By default, the act_request_timestamp has been hardcoded in the init extraction (it either assumes a given extraction_timestamp or the current timestamp); however, this may cause problems when merging changes in silver for Write Optimized DSOs. Therefore, a new possibility to retrieve this timestamp from the act_req_table was added.

    \n\n

    This scenario performs the data extraction from Write Optimized DSOs, forcing the actrequest_timestamp to\nassume the value from the activation requests table (timestamp column).

    \n\n

    This feature is only available for WODSOs and, to use it, you need to specify \"get_timestamp_from_act_request\": True.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "changelog_table": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "request_col_name": "request",\n                "include_changelog_tech_cols": False,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP/",\n                "extraction_type": "init",\n                "numPartitions": 2,\n                "partitionColumn": "RECORD",\n                "lowerBound": 1,\n                "upperBound": 50000,\n                "get_timestamp_from_act_request": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    How can we decide the partitionColumn?

    \n\n

    Compatible partitionColumn for upperBound/lowerBound Spark options:

    \n\n

    It needs to be an int, date or timestamp column \u2192 https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html

    \n\n

    If you don't have any column to partition on those formats, you can use predicates to partition the table \u2192 https://docs.databricks.com/en/connect/external-systems/jdbc.html#manage-parallelism

    \n\n

    One of the most important parameters to optimise the extraction is the partitionColumn, as you can see in the template. Thus, this section helps you figure out if a column is a good candidate or not.

    \n\n

    Basically the partition column needs to be a column which is able to adequately split the processing, which means we can use it to \"create\" different queries with intervals/filters, so that the Spark tasks process similar amounts of rows/volume. Usually a good candidate is an integer auto-increment technical column.

    \n\n
    \n\n
    Although RECORD is usually a good candidate, it is usually available on the changelog table only, meaning that you would need to use a different strategy for the init. In case you don't have good candidates for partitionColumn, you can use the sample acon provided in scenario 2.1 of the template above. It might make sense to use scenario 2.1 for the init and then scenario 2.2 or 2.3 for the subsequent deltas.
    \n\n
    \n\n

    When there is no int, date or timestamp good candidate for partitionColumn:

    \n\n

    In this case, you can opt for scenario 2.6 - Generate Predicates, which supports any kind of column to be defined as the partitionColumn.

    \n\n

    However, you should still analyse whether the column you have in mind is a good candidate or not. In this scenario, Spark will create one query per distinct value of the partitionColumn, so it is worth checking how many distinct values exist and how evenly they split the data, as in the sketch below.
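
    The sketch below assumes a running SparkSession named spark and the same placeholder connection details used in the template above; it pushes a simple GROUP BY down to the source to inspect the row distribution per candidate value.

    \n\n
    \n
    # Hedged sketch: check how evenly a candidate partitionColumn would split the extraction.\ncandidate_counts = (\n    spark.read.format("jdbc")\n    .option("url", "my_sap_bw_url")\n    .option("user", "my_user")\n    .option("password", "my_hana_pwd")\n    .option("driver", "com.sap.db.jdbc.Driver")\n    .option("dbtable", "(SELECT my_candidate_col, COUNT(*) AS cnt FROM my_database.my_table GROUP BY my_candidate_col) t")\n    .load()\n)\n\ncandidate_counts.orderBy("cnt", ascending=False).show(20)\n# If a handful of values hold most of the rows, the column is a poor candidate.\n
    \n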

    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_from_sftp", "modulename": "lakehouse_engine_usage.data_loader.extract_from_sftp", "kind": "module", "doc": "

    Extract from SFTP

    \n\n

    Secure File Transfer Protocol (SFTP) is a file protocol for transferring files over a secure (SSH) connection.

    \n\n

    This feature is available in the Lakehouse Engine with the purpose of having a mechanism to read data directly from SFTP directories without moving those files manually/physically to an S3 bucket.

    \n\n

    The engine uses Pandas to read the files and converts them into a Spark dataframe, which makes the available resources of an Acon usable, such as dq_specs, output_specs, terminator_specs and transform_specs.

    \n\n

    Furthermore, this feature provides several filters on the directories that make it easier to control the extractions.

    \n\n

    Introductory Notes:

    \n\n

    There are important parameters that must be added to input specs in order to make the SFTP extraction work properly:

    \n\n
    \n\n
    Read type
    The engine supports only BATCH mode for this feature.
    \n\n
    \n\n

    sftp_files_format - File format that will be used to read data from SFTP. The engine supports: CSV, FWF, JSON and XML.

    \n\n

    location - The SFTP directory to be extracted. If it is necessary to filter a specific file, it can be done using the file_name_contains option.

    \n\n

    options - Arguments used to set the Paramiko SSH client connection (hostname, username, password, port...), set the filter to retrieve files and set the file parameters (separators, headers, cols...). For more information about the file parameters, please go to the Pandas link in the useful links section.

    \n\n

    The options allowed are:

    \n\n
    Property type | Detail | Example | Comment
    Connection | add_auto_policy (str) | true or false | Indicates whether to allow an SFTP connection using no host key. When a connection attempt is made using no host key and add_auto_policy is false, the engine will throw an exception. The purpose of this flag is to make the user consciously choose a less secure connection.
    Connection | key_type (str) | \"Ed25519\" or \"RSA\" | Indicates the key type to be used for the connection (Ed25519 or RSA).
    Connection | key_filename (str) | \"/path/to/private_key/private_key.ppk\" | The filename, or list of filenames, of optional private key(s) and/or certs to try for authentication. It must be used with a pkey in order to add a policy. If a pkey is not provided, then use add_auto_policy.
    Connection | pkey (str) | \"AAAAC3MidD1lVBI1NTE5AAAAIKssLqd6hjahPi9FBH4GPDqMqwxOMsfxTgowqDCQAeX+\" | Value to use for the host key when connecting to the remote SFTP server.
    Filter | date_time_gt (str) | \"1900-01-01\" or \"1900-01-01 08:59:59\" | Filter the files greater than the string datetime formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
    Filter | date_time_lt (str) | \"3999-12-31\" or \"3999-12-31 20:59:59\" | Filter the files lower than the string datetime formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
    Filter | earliest_file (bool) | true or false | Filter the earliest dated file in the directory.
    Filter | file_name_contains (str) | \"part_of_filename\" | Filter files that match the pattern.
    Filter | latest_file (bool) | true or false | Filter the most recent dated file in the directory.
    Read data from subdirectories | sub_dir (bool) | true or false | The engine will search for files in subdirectories of the location, considering one level below the root location given. When sub_dir is used with the latest_file/earliest_file argument, the engine will retrieve the latest/earliest file for each subdirectory.
    Add metadata info | file_metadata (bool) | true or false | When this option is set to True, the dataframe retrieves the filename with location and the modification_time from the original files in SFTP, attaching these two columns to the respective records.
    \n\n

    Useful Info & Links:

    \n\n
      \n
    1. Paramiko SSH Client
    2. Pandas documentation
    \n\n

    Scenario 1

    \n\n

    The scenario below shows the extraction of a CSV file using most of the available filter options. Also, as an example, the column \"created_on\" is created in the transform_specs in order to store the processing date of every record. As a result, the output table will have both the original file date (provided by the file_metadata option) and the processing date from the engine.

    \n\n

    For an incremental load approach, it is advised to use the \"modification_time\" column created by the file_metadata option. Since it holds the original file modification date, it can be used in the logic to control what is new or has changed recently (see the sketch after the Scenario 1 example below).

    \n\n
    \n\n
    The scenario below uses \"add_auto_policy\": true, which is not recommended.
    \n\n
    \n\n
    \n
    from datetime import datetime\nfrom lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n      {\n          "spec_id": "sftp_source",\n          "read_type": "batch",\n          "data_format": "sftp",\n          "sftp_files_format": "csv",\n          "location": "my_sftp_data_path",\n          "options": {\n              "hostname": "my_sftp_hostname",\n              "username": "my_sftp_username",\n              "password": "my_sftp_password",\n              "port": "my_port",\n              "add_auto_policy": True,\n              "file_name_contains": "test_pattern",\n              "args": {"sep": "|"},\n              "latest_file": True,\n              "file_metadata": True\n          }\n      },\n  ],\n  "transform_specs": [\n      {\n          "spec_id": "sftp_transformations",\n          "input_id": "sftp_source",\n          "transformers": [\n              {\n                  "function": "with_literals",\n                  "args": {"literals": {"created_on": datetime.now()}},\n              },\n          ],\n      },\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sftp_bronze",\n      "input_id": "sftp_transformations",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "s3://my_path/dummy_table"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
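
    As mentioned in Scenario 1, the modification_time column added by file_metadata can drive an incremental load. The sketch below reuses the get_max_value and incremental_filter transformers shown in the JDBC delta template of this documentation; spec ids, paths and connection details are hypothetical placeholders.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\n# Hedged sketch: incremental SFTP load driven by the modification_time column from file_metadata.\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "sftp_source",\n            "read_type": "batch",\n            "data_format": "sftp",\n            "sftp_files_format": "csv",\n            "location": "my_sftp_data_path",\n            "options": {\n                "hostname": "my_sftp_hostname",\n                "username": "my_sftp_username",\n                "password": "my_sftp_password",\n                "port": "my_port",\n                "key_type": "RSA",\n                "key_filename": "dbfs_mount_location/my_file_key.ppk",\n                "pkey": "my_key",\n                "args": {"sep": "|"},\n                "file_metadata": True,\n            },\n        },\n        {\n            # current state of the bronze table, used to derive the max modification_time already loaded\n            "spec_id": "sftp_bronze_current",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/dummy_table",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_modification_time",\n            "input_id": "sftp_bronze_current",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "modification_time"}}],\n        },\n        {\n            "spec_id": "new_files_only",\n            "input_id": "sftp_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "modification_time", "increment_df": "max_modification_time"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "sftp_bronze",\n            "input_id": "new_files_only",\n            "write_type": "append",\n            "data_format": "delta",\n            "location": "s3://my_path/dummy_table",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n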

    Scenario 2

    \n\n

    The following scenario shows the extraction of a JSON file using an RSA pkey authentication instead of add_auto_policy. The engine supports Ed25519 and RSA key types for pkeys.

    \n\n

    For the pkey file location, it is important to have the file in a location accessible by the cluster. This can be achieved either by mounting the location or with volumes.

    \n\n
    \n\n\n\n
    \n\n
    \n
    from datetime import datetime\nfrom lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n      {\n          "spec_id": "sftp_source",\n          "read_type": "batch",\n          "data_format": "sftp",\n          "sftp_files_format": "json",\n          "location": "my_sftp_data_path",\n          "options": {\n              "hostname": "my_sftp_hostname",\n              "username": "my_sftp_username",\n              "password": "my_sftp_password",\n              "port": "my_port",\n              "key_type": "RSA",\n              "key_filename": "dbfs_mount_location/my_file_key.ppk",\n              "pkey": "my_key",\n              "latest_file": True,\n              "file_metadata": True,\n              "args": {"lines": True, "orient": "columns"},\n          },\n      },\n  ],\n  "transform_specs": [\n      {\n          "spec_id": "sftp_transformations",\n          "input_id": "sftp_source",\n          "transformers": [\n              {\n                  "function": "with_literals",\n                  "args": {"literals": {"lh_created_on": datetime.now()}},\n              },\n          ],\n      },\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sftp_bronze",\n      "input_id": "sftp_transformations",\n      "write_type": "overwrite",\n      "data_format": "delta",\n      "location": "s3://my_path/dummy_table"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_using_jdbc_connection", "modulename": "lakehouse_engine_usage.data_loader.extract_using_jdbc_connection", "kind": "module", "doc": "

    Extract using JDBC connection

    \n\n
    \n\n
    SAP Extraction
    \n\n

    SAP is only used as an example to demonstrate how we can use a JDBC connection to extract data.

    \n\n

    If you are looking to extract data from SAP, please use our sap_b4 or sap_bw reader.

    \n\n

    You can find the sap_b4 reader documentation here: Extract from SAP B4 ADSOs, and the sap_bw reader documentation here: Extract from SAP BW DSOs.

    \n\n
    \n\n
    \n\n
    Parallel Extraction
    \n\n

    Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.

    \n\n
    \n\n

    Introduction

    \n\n

    Many databases allow a JDBC connection to extract data. Our engine has one reader where you can configure all the necessary definitions to connect to a database using JDBC.

    \n\n

    In the next section you will find several examples about how to do it.

    \n\n

    The Simplest Scenario using sqlite

    \n\n
    \n\n\n\n
    \n\n

    This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source.

    \n\n

    Here we use a sqlite database where any connection is allowed. Due to that, we do not specify any username or password.

    \n\n

    As with Spark, we provide two different ways to run the jdbc reader.

    \n\n

    1 - We can use the jdbc() function, passing in all the arguments needed for Spark to work, and we can even combine this with additional options passed through .options().

    \n\n

    2 - The other way is using .format(\"jdbc\") and passing all the necessary arguments through .options(). It is worth noting that, even when choosing jdbc(), we can still add options() to the execution.

    \n\n

    You can find and run the following code in our local test for the engine.

    \n\n

    jdbc() function

    \n\n

    As we can see in the next cell, all the arguments necessary to establish the jdbc connection are passed inside the jdbc_args object. Here we find the url, the table, and the driver. Besides that, we can add options, such as the partition number. The partition number will impact the queries' parallelism.

    \n\n

    The code below is an example of how to use the jdbc() function in our ACON.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "jdbc",\n      "jdbc_args": {\n        "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db",\n        "table": "jdbc_function",\n        "properties": {\n          "driver": "org.sqlite.JDBC"\n        }\n      },\n      "options": {\n        "numPartitions": 1\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "overwrite",\n      "db_table": "test_db.jdbc_function_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_function/correct_arguments/data"\n    }\n  ]\n}\n
    \n
    \n\n

    This is the same as using the following code in pyspark:

    \n\n
    \n
    spark.read.option("numPartitions", 1).jdbc(\n    url="jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db",\n    table="jdbc_function",\n    properties={"driver": "org.sqlite.JDBC"},\n)\n
    \n
    \n\n

    .format(\"jdbc\")

    \n\n

    In this example we do not use the jdbc_args object. All the jdbc connection parameters are inside the options object.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "jdbc",\n      "options": {\n        "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db",\n        "dbtable": "jdbc_format",\n        "driver": "org.sqlite.JDBC",\n        "numPartitions": 1\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "overwrite",\n      "db_table": "test_db.jdbc_format_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_format/correct_arguments/data"\n    }\n  ]\n}\n
    \n
    \n\n

    This is the same as using the following code in PySpark:

    \n\n
    \n
    spark.read.format("jdbc")\n    .option("url", "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db")\n    .option("driver", "org.sqlite.JDBC")\n    .option("dbtable", "jdbc_format")\n    .option("numPartitions", 1)\n    .load()\n
    \n
    \n\n

    Template with more complete and runnable examples

    \n\n

    In this template we will use SAP as the source for a more complete and runnable example.\nThese definitions can be used with several databases that allow a JDBC connection.

    \n\n

    The following scenarios of extractions are covered:

    \n\n\n\n
    \n\n

    Disclaimer: This template only uses SAP as a demonstration example for a JDBC connection.\nThis isn't a SAP template!!!\nIf you are looking to extract data from SAP, please use our sap_b4 reader or the sap_bw reader.

    \n\n
    \n\n

    The JDBC connection has 2 main sections to be filled in, jdbc_args and options:

    \n\n\n\n

    If you want to know more about Spark JDBC options, you can follow the link below:

    \n\n\n\n

    If you want a better understanding of Spark JDBC optimisations, you can check the following resources:

    \n\n\n\n\n\n

    This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques\nand using a single connection to retrieve all the data from the source. It should only be used when the data\nyou want to extract is small and there are no big performance requirements to fulfil.\nWhen extracting from the source, we can have two options:

    \n\n\n\n
    Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_jdbc_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2 - Parallel extraction

    \n\n

    In this section we present 3 possible scenarios for parallel extractions from JDBC sources.

    \n\n
    \n\n

    Disclaimer for parallel extraction: parallel extractions can bring a JDBC source down if a lot of stress\nis put on the system. Be careful when choosing the number of partitions.\nSpark is a distributed system and can open many connections against the source.

    \n\n
    \n\n

    2.1 - Parallel Extraction, Simplest Scenario

    \n\n

    This scenario provides the simplest example you can have for a parallel extraction from JDBC sources, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch experience around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction into several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.

    \n\n

    In the example below, \"numPartitions\": 10 is specified, meaning that Spark will open 10 parallel connections\nto the source and automatically decide how to parallelize the extraction based on that setting. This is the\nonly change compared to the example provided in scenario 1.

    \n\n
    Delta Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n                "numPartitions": 10,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n                "numPartitions": 10,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.2 - Parallel Extraction, Provide upper_bound (Recommended)

    \n\n

    This scenario performs the extraction from the JDBC source in parallel, but takes more care to\noptimise the extraction and gives more control (compared to the 2.1 example) over how it is split and performed,\nusing the following options:

    \n\n\n\n

    This is an adequate example to follow if there is a column in the data source that is well suited to\nbe used as the partitionColumn. Compared with the previous example,\nnumPartitions is provided together with three additional options to fine-tune the extraction (partitionColumn, lowerBound,\nupperBound).

    \n\n

    When these 4 properties are used, Spark will use them to build several queries to split the extraction.\nExample: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound\": 1, \"upperBound\": 100,\nSpark will generate 10 queries like:

    \n\n\n\n
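
    As an illustration of this splitting behaviour, the sketch below shows the equivalent plain PySpark read (outside the ACON), together with the approximate per-partition filters Spark derives from those bounds. The url and table name are placeholders, and the exact boundary values are computed internally by Spark.

    \n\n
    \n
    from pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n\n# Illustrative sketch only: equivalent plain PySpark read using the same 4 options.\ndf = (\n    spark.read.format("jdbc")\n    .option("url", "my_jdbc_url")\n    .option("dbtable", "my_database.my_table")\n    .option("partitionColumn", "record")\n    .option("numPartitions", 10)\n    .option("lowerBound", 1)\n    .option("upperBound", 100)\n    .load()\n)\n\n# Spark then issues one query per partition, with filters roughly of the form:\n#   ... WHERE record < 11 OR record IS NULL    -- partition 1\n#   ... WHERE record >= 11 AND record < 21     -- partition 2\n#   ...\n#   ... WHERE record >= 91                     -- partition 10\n
    \n
    \n\n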
    Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n                "upperBound": 2000,\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n                "upperBound": 2000,\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.3 - Parallel Extraction with Predicates (Recommended)

    \n\n

    This scenario performs the extraction from the JDBC source in parallel and is useful in contexts where there aren't\nnumeric, date or timestamp columns to parallelize the extraction:

    \n\n\n\n\n\n

    When this property is used, all the predicates need to be provided to Spark, otherwise it will leave data behind.

    \n\n

    Below, a lakehouse function to generate the predicate list automatically is presented.

    \n\n

    When using this function, one needs to be especially careful with the predicates_query and predicates_add_null variables.

    \n\n

    predicates_query: In the sample below the whole table (select distinct(x) from table) is being considered,\nbut it is possible to filter the predicates list here, especially if you are applying a filter in the\ntransformations spec and you know the entire table won't be necessary, so you can change it to something like:\nselect distinct(x) from table where x > y.

    \n\n

    predicates_add_null: One can choose whether to include null in the predicates list or not. By default, this property is True.\nExample: for \"partitionColumn\": \"record\"

    \n\n
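
    As an illustration (not the exact output of the engine), a generated predicates list for \"partitionColumn\": \"record\" could look like the sketch below; in plain PySpark it would be passed to the predicates argument of spark.read.jdbc, which opens one connection per predicate. The column values, url and credentials are placeholders.

    \n\n
    \n
    from pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n\n# Illustrative sketch only: a possible predicates list for "partitionColumn": "record",\n# where each predicate becomes one Spark partition/connection.\npredicates = [\n    "record = '1'",\n    "record = '2'",\n    "record = '3'",\n    "record IS NULL",  # only added when predicates_add_null is True\n]\n\n# In plain PySpark, such a list is passed to the predicates argument of spark.read.jdbc:\ndf = spark.read.jdbc(\n    url="my_jdbc_url",\n    table="my_database.my_table",\n    predicates=predicates,\n    properties={"user": "my_user", "password": "my_pwd", "driver": "my_driver"},\n)\n
    \n
    \n\n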
    Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\nExecEnv.get_or_create()\n\npartitionColumn = "my_partition_col"\ndbtable = "my_database.my_table"\n\npredicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})"""\ncolumn_for_predicates = partitionColumn\nuser = "my_user"\npassword = "my_b4_hana_pwd"\nurl = "my_sap_b4_url"\ndriver = "com.sap.db.jdbc.Driver"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        predicates_add_null=predicates_add_null,\n        partition_column=partitionColumn,\n        dbtable=dbtable,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "predicates": predicates,\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\nExecEnv.get_or_create()\n\npartitionColumn = "my_partition_col"\ndbtable = "my_database.my_table"\n\npredicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})"""\ncolumn_for_predicates = partitionColumn\nuser = "my_user"\npassword = "my_b4_hana_pwd"\nurl = "my_sap_b4_url"\ndriver = "com.sap.db.jdbc.Driver"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        predicates_add_null=predicates_add_null,\n        partition_column=partitionColumn,\n        dbtable=dbtable,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "predicates": predicates,\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.filtered_full_load", "modulename": "lakehouse_engine_usage.data_loader.filtered_full_load", "kind": "module", "doc": "

    Filtered Full Load

    \n\n

    This scenario is very similar to the full load, but it filters the data coming from the source, instead of doing a complete full load.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "inferSchema": true\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "filtered_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "expression_filter",\n          "args": {\n            "exp": "date like '2016%'"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "filtered_sales",\n      "write_type": "overwrite",\n      "data_format": "parquet",\n      "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter/data"\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.filtered_full_load_with_selective_replace", "modulename": "lakehouse_engine_usage.data_loader.filtered_full_load_with_selective_replace", "kind": "module", "doc": "

    Filtered Full Load with Selective Replace

    \n\n

    This scenario is very similar to the Filtered Full Load, but we only replace a subset of the partitions, leaving the other ones untouched, so we don't replace the entire table. This capability is very useful for backfilling scenarios.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "inferSchema": true\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter_partition_overwrite/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "filtered_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "expression_filter",\n          "args": {\n            "exp": "date like '2016%'"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "filtered_sales",\n      "write_type": "overwrite",\n      "data_format": "delta",\n      "partitions": [\n        "date",\n        "customer"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter_partition_overwrite/data",\n      "options": {\n        "replaceWhere": "date like '2016%'"\n      }\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.flatten_schema_and_explode_columns", "modulename": "lakehouse_engine_usage.data_loader.flatten_schema_and_explode_columns", "kind": "module", "doc": "

    Flatten Schema and Explode Columns

    \n\n

    Related with the schema, we can perform two kinds of operations:

    \n\n\n\n

    The flatten_schema scenario below transforms one or more columns, splitting the nested content into more columns, as desired. We define the number of nested levels we want to flatten in the schema. In this case, we are just setting a max_level of 2.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "json",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/source_schema.json",\n      "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "rename",\n          "args": {\n            "cols": {\n              "date": "date2",\n              "customer": "customer2"\n            }\n          }\n        },\n        {\n          "function": "with_expressions",\n          "args": {\n            "cols_and_exprs": {\n              "constant": "'just a constant'",\n              "length_customer2": "length(customer2)"\n            }\n          }\n        },\n        {\n          "function": "from_json",\n          "args": {\n            "input_col": "sample",\n            "schema": {\n              "type": "struct",\n              "fields": [\n                {\n                  "name": "field1",\n                  "type": "string",\n                  "nullable": true,\n                  "metadata": {}\n                },\n                {\n                  "name": "field2",\n                  "type": "string",\n                  "nullable": true,\n                  "metadata": {}\n                },\n                {\n                  "name": "field3",\n                  "type": "double",\n                  "nullable": true,\n                  "metadata": {}\n                },\n                {\n                  "name": "field4",\n                  "type": {\n                    "type": "struct",\n                    "fields": [\n                      {\n                        "name": "field1",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      },\n                      {\n                        "name": "field2",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      }\n                    ]\n                  },\n                  "nullable": true,\n                  "metadata": {}\n                }\n              ]\n            }\n          }\n        },\n        {\n          "function": "to_json",\n          "args": {\n            "in_cols": [\n              "item",\n              "amount"\n            ],\n            "out_col": "item_amount_json"\n          }\n        },\n        {\n          "function": "flatten_schema",\n          "args": {\n            "max_level": 2\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/batch/data"\n    }\n  ]\n}\n
    \n
    \n\n

    The explode_arrays scenario transforms the array columns into one or more rows, depending on the number of elements, i.e. it replicates the row for each array value. In this case we are exploding all array columns, by setting explode_arrays to true.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "json",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/source_schema.json",\n      "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "rename",\n          "args": {\n            "cols": {\n              "date": "date2",\n              "customer": "customer2"\n            }\n          }\n        },\n        {\n          "function": "with_expressions",\n          "args": {\n            "cols_and_exprs": {\n              "constant": "'just a constant'",\n              "length_customer2": "length(customer2)"\n            }\n          }\n        },\n        {\n          "function": "to_json",\n          "args": {\n            "in_cols": [\n              "item",\n              "amount"\n            ],\n            "out_col": "item_amount_json"\n          }\n        },\n        {\n          "function": "explode_columns",\n          "args": {\n            "explode_arrays": true\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/batch/data"\n    }\n  ]\n}\n
    \n
    \n\n

    The flatten_and_explode_arrays_and_maps scenario uses flatten_schema and explode_columns to get the desired output. In this case, the desired output is to flatten the whole schema and explode maps and arrays, even when an array is nested inside a struct. Steps:

    \n\n
    1. In this case, we have an array column inside a struct column, so first we need to use the `flatten_schema` transformer to extract the columns inside that struct;\n2. Then, we are able to explode all the desired array and map columns, using the `explode_columns` transformer;\n3. To split the map column into 2 columns, we use the `flatten_schema` transformer again.\n
    \n\n

    As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "json",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json",\n      "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "rename",\n          "args": {\n            "cols": {\n              "date": "date2",\n              "customer": "customer2"\n            }\n          }\n        },\n        {\n          "function": "with_expressions",\n          "args": {\n            "cols_and_exprs": {\n              "constant": "'just a constant'",\n              "length_customer2": "length(customer2)"\n            }\n          }\n        },\n        {\n          "function": "from_json",\n          "args": {\n            "input_col": "agg_fields",\n            "schema": {\n              "type": "struct",\n              "fields": [\n                {\n                  "name": "field1",\n                  "nullable": true,\n                  "metadata": {},\n                  "type": {\n                    "containsNull": true,\n                    "elementType": "string",\n                    "type": "array"\n                  }\n                },\n                {\n                  "name": "field2",\n                  "type": {\n                    "type": "struct",\n                    "fields": [\n                      {\n                        "name": "field1",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      },\n                      {\n                        "name": "field2",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      }\n                    ]\n                  },\n                  "nullable": true,\n                  "metadata": {}\n                }\n              ]\n            }\n          }\n        },\n        {\n          "function": "to_json",\n          "args": {\n            "in_cols": [\n              "item",\n              "amount"\n            ],\n            "out_col": "item_amount_json"\n          }\n        },\n        {\n          "function": "flatten_schema",\n          "args": {\n            "max_level": 2\n          }\n        },\n        {\n          "function": "explode_columns",\n          "args": {\n            "explode_arrays": true,\n            "map_cols_to_explode": [\n              "sample"\n            ]\n          }\n        },\n        {\n          "function": "flatten_schema",\n          "args": {\n            "max_level": 2\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch/data"\n    }\n  ]\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.full_load", "modulename": "lakehouse_engine_usage.data_loader.full_load", "kind": "module", "doc": "

    Full Load

    \n\n

    This scenario reads CSV data from a path and writes it in full to another path as delta lake files.

    \n\n
    Relevant notes
    \n\n\n\n

    As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "inferSchema": true\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/full_load/full_overwrite/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "repartitioned_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "repartition",\n          "args": {\n            "num_partitions": 1,\n            "cols": ["date", "customer"]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "overwrite",\n      "data_format": "delta",\n      "partitions": [\n        "date",\n        "customer"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/full_load/full_overwrite/data"\n    }\n  ]\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.read_from_dataframe", "modulename": "lakehouse_engine_usage.data_loader.read_from_dataframe", "kind": "module", "doc": "

    Read from Dataframe

    \n\n
    \n\n
    Don't use this feature if the Lakehouse Engine already has a supported data format for your use case, as in that case it is preferred to use the dedicated data formats which are more extensively tested and predictable. Check the supported data formats here.
    \n\n
    \n\n

    Reading from a Spark DataFrame is very simple using our framework. You just need to define the input_specs as follows:

    \n\n
    \n
    {\n    "input_spec": {\n        "spec_id": "my_df",\n        "read_type": "batch",\n        "data_format": "dataframe",\n        "df_name": df,\n    }\n}\n
    \n
    \n\n
    \n\n
    Why is it relevant?
    \n\n

    With this capability of reading a dataframe you can deal with sources that do not yet officially have a reader (e.g., REST api, XML files, etc.).

    \n\n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_append_load_with_malformed", "modulename": "lakehouse_engine_usage.data_loader.streaming_append_load_with_malformed", "kind": "module", "doc": "

    Streaming Append Load with DROPMALFORMED

    \n\n

    This scenario illustrates an append load done via streaming instead of batch, providing an efficient way of picking up new files from an S3 folder, instead of relying on the incremental filtering from the source needed in a batch-based process (see append loads in batch from a JDBC source to understand the differences between streaming and batch append loads). However, not all sources (e.g., JDBC) allow streaming.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "mode": "DROPMALFORMED"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming_dropmalformed/data",\n      "schema": {\n        "type": "struct",\n        "fields": [\n          {\n            "name": "salesorder",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "item",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "date",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "customer",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "article",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "amount",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          }\n        ]\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "db_table": "test_db.streaming_dropmalformed_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming_dropmalformed/checkpoint"\n      },\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming_dropmalformed/data"\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_append_load_with_terminator", "modulename": "lakehouse_engine_usage.data_loader.streaming_append_load_with_terminator", "kind": "module", "doc": "

    Streaming Append Load with Optimize Dataset Terminator

    \n\n

    This scenario includes a terminator which optimizes a dataset (table), being able to vacuum the table, optimize it (with or without z-order), compute table statistics and more. You can find more details on the Terminator here.

    \n\n

    As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "mode": "DROPMALFORMED"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming_with_terminators/data",\n      "schema": {\n        "type": "struct",\n        "fields": [\n          {\n            "name": "salesorder",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "item",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "date",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "customer",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "article",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "amount",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          }\n        ]\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "db_table": "test_db.streaming_with_terminators_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming_with_terminators/checkpoint"\n      },\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming_with_terminators/data"\n    }\n  ],\n  "terminate_specs": [\n    {\n      "function": "optimize_dataset",\n      "args": {\n        "db_table": "test_db.streaming_with_terminators_table",\n        "debug": true\n      }\n    }\n  ]\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_delta_load_with_group_and_rank_condensation", "modulename": "lakehouse_engine_usage.data_loader.streaming_delta_load_with_group_and_rank_condensation", "kind": "module", "doc": "

    Streaming Delta Load with Group and Rank Condensation

    \n\n

    This scenario is useful when we want to do delta loads based on changelogs that first need to be condensed using a group by and then a rank only, instead of the record mode logic in the record-mode-based change data capture.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/source_schema.json",\n      "with_filepath": true,\n      "options": {\n        "mode": "FAILFAST",\n        "header": true,\n        "delimiter": "|"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_bronze_with_extraction_date",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "with_regex_value",\n          "args": {\n            "input_col": "lhe_extraction_filepath",\n            "output_col": "extraction_date",\n            "drop_input_col": true,\n            "regex": ".*WE_SO_SCL_(\\\\d+).csv"\n          }\n        },\n        {\n          "function": "with_auto_increment_id"\n        },\n        {\n          "function": "group_and_rank",\n          "args": {\n            "group_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key": [\n              "extraction_date",\n              "changed_on",\n              "lhe_row_id"\n            ]\n          }\n        },\n        {\n          "function": "repartition",\n          "args": {\n            "num_partitions": 1\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_silver",\n      "input_id": "sales_bronze_with_extraction_date",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data",\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/checkpoint"\n      },\n      "with_batch_id": true,\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item",\n        "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on",\n        "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'"\n      }\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_delta_with_late_arriving_and_out_of_order_events", "modulename": "lakehouse_engine_usage.data_loader.streaming_delta_with_late_arriving_and_out_of_order_events", "kind": "module", "doc": "

    Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking)

    \n\n

    How to Deal with Late Arriving Data without using Watermark

    \n\n

    This scenario covers a delta load in streaming mode that is able to deal with late arriving and out of order events.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "transformed_sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "transformed_sales_source",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data",\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/checkpoint"\n      },\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n        "update_predicate": "new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)",\n        "delete_predicate": "new.recordmode in ('R','D','X')",\n        "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"\n      }\n    }\n  ],\n  "exec_env": {\n    "spark.sql.streaming.schemaInference": true\n  }\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n\n
    \n\n
    Disclaimer! The scenario illustrated in this page is purely fictional, designed for the Lakehouse Engine local tests specifically. Your data source changelogs may be different and the scenario and predicates discussed here may not make sense to you. Consequently, the data product team should reason about the adequate merge predicate and insert, update and delete predicates, that better reflect how they want to handle the delta loads for their data.
    \n\n
    \n\n\n\n\n\n

    How to Deal with Late Arriving Data using Watermark

    \n\n

    When building real-time pipelines, one of the realities that teams have to work with is that distributed data ingestion is inherently unordered. Additionally, in the context of stateful streaming operations, teams need to be able to properly track event time progress in the stream of data they are ingesting for the proper calculation of time-window aggregations and other stateful operations. While working with real-time streaming data there will be delays between event time and processing time due to how data is ingested and whether the overall application experiences issues like downtime. Due to these potential variable delays, the engine that you use to process this data needs to have some mechanism to decide when to close the aggregate windows and produce the aggregate result.

    \n\n

    Imagine a scenario where we will need to perform stateful aggregations on the streaming data to understand and identify problems in the machines. This is where we need to leverage Structured Streaming and Watermarking to produce the necessary stateful aggregations.

    \n\n
    Approach 1 - Use a pre-defined fixed window (Bad)
    \n\n

    \n\n

    Credits: Image source

    \n\n

    To explain this visually let\u2019s take a scenario where we are receiving data at various times from around 10:50 AM \u2192 11:20 AM. We are creating 10-minute tumbling windows that calculate the average of the temperature and pressure readings that came in during the windowed period.

    \n\n

    In this first picture, we have the tumbling windows trigger at 11:00 AM, 11:10 AM and 11:20 AM leading to the result tables shown at the respective times. When the second batch of data comes around 11:10 AM with data that has an event time of 10:53 AM this gets incorporated into the temperature and pressure averages calculated for the 11:00 AM \u2192 11:10 AM window that closes at 11:10 AM, which does not give the correct result.

    \n\n
    Approach 2 - Watermark
    \n\n

    We can define a watermark that will allow Spark to understand when to close the aggregate window and produce the correct aggregate result. In Structured Streaming applications, we can ensure that all relevant data for the aggregations we want to calculate is collected by using a feature called watermarking. In the most basic sense, by defining a watermark Spark Structured Streaming then knows when it has ingested all data up to some time, T, (based on a set lateness expectation) so that it can close and produce windowed aggregates up to timestamp T.

    \n\n

    \n\n

    Credits: Image source

    \n\n

    Unlike the first scenario where Spark will emit the windowed aggregation for the previous ten minutes every ten minutes (i.e. emit the 11:00 AM \u2192 11:10 AM window at 11:10 AM), Spark now waits to close and output the windowed aggregation once the max event time seen minus the specified watermark is greater than the upper bound of the window.

    \n\n

    In other words, Spark needed to wait until it saw data points where the latest event time seen minus 10 minutes was greater than 11:00 AM to emit the 10:50 AM \u2192 11:00 AM aggregate window. At 11:00 AM, it does not see this, so it only initialises the aggregate calculation in Spark\u2019s internal state store. At 11:10 AM, this condition is still not met, but we have a new data point for 10:53 AM so the internal state gets updated, just not emitted. Then finally by 11:20 AM Spark has seen a data point with an event time of 11:15 AM and since 11:15 AM minus 10 minutes is 11:05 AM which is later than 11:00 AM the 10:50 AM \u2192 11:00 AM window can be emitted to the result table.

    \n\n

    This produces the correct result by properly incorporating the data based on the expected lateness defined by the watermark. Once the results are emitted the corresponding state is removed from the state store.

    \n\n
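
    To make this concrete, the sketch below shows, in plain PySpark outside any ACON, a 10-minute tumbling-window aggregation with a 10-minute watermark. The source path and the event_time, temperature and pressure column names are assumptions for this fictional machine-readings scenario.

    \n\n
    \n
    from pyspark.sql import SparkSession\nfrom pyspark.sql.functions import avg, window\n\nspark = SparkSession.builder.getOrCreate()\n\n# Assumed streaming source with event_time, temperature and pressure columns.\nreadings = spark.readStream.format("delta").load("s3://my_path/machine_readings/")\n\naggregated = (\n    readings\n    # Tolerate events arriving up to 10 minutes late before closing a window.\n    .withWatermark("event_time", "10 minutes")\n    # 10-minute tumbling windows on the event time.\n    .groupBy(window("event_time", "10 minutes"))\n    .agg(\n        avg("temperature").alias("avg_temperature"),\n        avg("pressure").alias("avg_pressure"),\n    )\n)\n
    \n
    \n\n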
    Watermarking and Different Output Modes
    \n\n

    It is important to understand how state, late-arriving records, and the different output modes could lead to different behaviours of your application running on Spark. The main takeaway here is that in both append and update modes, once the watermark indicates that all data is received for an aggregate time window, the engine can trim the window state. In append mode the aggregate is produced only at the closing of the time window plus the watermark delay while in update mode it is produced on every update to the window.

    \n\n

    Lastly, by increasing your watermark delay window you will cause the pipeline to wait longer for data and potentially drop less data \u2013 higher precision, but also higher latency to produce the aggregates. On the flip side, smaller watermark delay leads to lower precision but also lower latency to produce the aggregates.

    \n\n

    Watermarks can only be used when you are running your streaming application in append or update output modes. There is a third output mode, complete mode, in which the entire result table is written to storage. This mode cannot be used because it requires all aggregate data to be preserved, and hence cannot use watermarking to drop intermediate state.

    \n\n
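
    Continuing the sketch above, writing the aggregated stream with outputMode(\"append\") only emits each window once the watermark closes it, while outputMode(\"update\") emits it on every update to the window; the paths below are placeholders.

    \n\n
    \n
    # Writing the aggregated stream from the previous sketch; with "append", a window is\n# only emitted once the watermark closes it, while "update" emits it on every change.\nquery = (\n    aggregated.writeStream\n    .outputMode("append")\n    .format("delta")\n    .option("checkpointLocation", "s3://my_path/machine_readings_agg/checkpoint/")\n    .start("s3://my_path/machine_readings_agg/data/")\n)\n
    \n
    \n\n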
    Joins With Watermark
    \n\n

    There are three types of stream-stream joins that can be implemented in Structured Streaming: inner, outer, and semi joins. The main problem with doing joins in streaming applications is that you may have an incomplete picture of one side of the join. Giving Spark an understanding of when there are no future matches to expect is similar to the earlier problem with aggregations where Spark needed to understand when there were no new rows to incorporate into the calculation for the aggregation before emitting it.

    \n\n

    To allow Spark to handle this, we can leverage a combination of watermarks and event-time constraints within the join condition of the stream-stream join. This combination allows Spark to filter out late records and trim the state for the join operation through a time range condition on the join.

    \n\n

    Spark has a policy for handling multiple watermark definitions: by default it maintains one global watermark based on the slowest stream, to ensure the highest amount of safety when it comes to not missing data.

    \n\n

    We can change this behaviour by changing spark.sql.streaming.multipleWatermarkPolicy to max; however, this means that data from the slower stream will be dropped.
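    For illustration, assuming an active SparkSession named spark, the policy can be switched like this (inside the engine the same property can be passed through the acon's exec_env):

    ```python
    # Default is "min" (slowest stream); "max" advances the global watermark with the
    # fastest stream, so late data from slower streams may be dropped.
    spark.conf.set("spark.sql.streaming.multipleWatermarkPolicy", "max")
    ```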

    \n\n
    State Store Performance Considerations
    \n\n

    As of Spark 3.2, Spark offers a RocksDB state store provider.

    \n\n

    If you have stateful operations in your streaming query (for example, streaming aggregation, streaming dropDuplicates, stream-stream joins, mapGroupsWithState, or flatMapGroupsWithState) and you want to maintain millions of keys in the state, then you may face issues related to large JVM garbage collection (GC) pauses causing high variations in the micro-batch processing times. This occurs because, with the default HDFSBackedStateStore implementation, the state data is maintained in the JVM memory of the executors, and a large number of state objects puts memory pressure on the JVM, causing high GC pauses.

    \n\n

    In such cases, you can choose to use a more optimized state management solution based on RocksDB. Rather than keeping the state in the JVM memory, this solution uses RocksDB to efficiently manage the state in the native memory and the local disk. Furthermore, any changes to this state are automatically saved by Structured Streaming to the checkpoint location you have provided, thus providing full fault-tolerance guarantees (the same as default state management).

    \n\n

    To enable the built-in RocksDB state store implementation, set spark.sql.streaming.stateStore.providerClass to org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider.
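    As a sketch, on a plain Spark session (outside the engine) this is the equivalent setting; within the engine, prefer the exec_env approach shown below:

    ```python
    spark.conf.set(
        "spark.sql.streaming.stateStore.providerClass",
        "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider",
    )
    ```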

    \n\n

    For more details please visit the Spark documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation

    \n\n

    You can enable this in your acons by specifying it as part of the exec_env properties, like below:

    \n\n
    \n
    "exec_env": {\n    "spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider"\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.write_and_read_dataframe", "modulename": "lakehouse_engine_usage.data_loader.write_and_read_dataframe", "kind": "module", "doc": "

    Write and Read Dataframe

    \n\n

    The DataFrame writer can give us some advantages by returning a dictionary containing the spec_id and the computed dataframe. In these examples we will cover the following scenarios of using the output dataframe format:

    \n\n
    1. Write to dataframe: Consuming the output spec as DataFrame;
    2. Write all dataframes: Consuming all DataFrames generated per specs;
    3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data.
    \n\n

    Main advantages of using this output writer:

    \n\n\n\n

    If you need, you can add as many dataframes as you want to the output specs, referencing the spec_id you want to add.

    \n\n
    \n\n

    This is not intended to replace the other capabilities offered by the lakehouse-engine. In case another feature can cover your use case, you should use it instead of the Dataframe writer, as those features are much more extensively tested on different types of operations.

    \n\n

    Additionally, please always consider whether the problem you are trying to solve, for which no lakehouse-engine feature is available, could be a common problem and thus deserve a common solution and feature.

    \n\n

    Moreover, the Dataframe writer is not supported for the streaming trigger types processing time and continuous.

    \n\n
    \n\n

    1. Write to dataframe: Consuming the output spec as DataFrame

    \n\n

    Silver Dummy Sales Write to DataFrame

    \n\n

    In this example we will cover the Dummy Sales write to a result containing the output DataFrame.

    \n\n\n\n
    \n\n
    If you try to retrieve the same data more than once using a checkpoint, an empty dataframe with an empty schema will be returned, as there is no new data to read.
    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_sales_bronze",\n            "read_type": "streaming",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_sales_transform",\n            "input_id": "dummy_sales_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_sales_silver",\n            "input_id": "dummy_sales_transform",\n            "data_format": "dataframe",\n            "options": {\n                "checkpointLocation": "s3://my_data_product_bucket/checkpoints/bronze/dummy_sales",\n            },\n        }\n    ],\n}\n
    \n
    \n\n

    Run the Load and Return the Dictionary with the DataFrames by OutputSpec

    \n\n

    This exploratory test will return a dictionary with the output spec and the dataframe\nthat will be stored after transformations.

    \n\n
    \n
    output = load_data(acon=acon)\ndisplay(output.keys())\ndisplay(output.get("dummy_sales_silver"))\n
    \n
    \n\n

    2. Write all dataframes: Consuming all DataFrames generated per specs

    \n\n

    Silver Dummy Sales Write to DataFrame

    \n\n

    In this example we will cover the Dummy Sales write to a result containing the specs and related DataFrame.

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_sales_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_sales_transform",\n            "input_id": "dummy_sales_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "sales_bronze",\n            "input_id": "dummy_sales_bronze",\n            "data_format": "dataframe",\n        },\n        {\n            "spec_id": "sales_silver",\n            "input_id": "dummy_sales_transform",\n            "data_format": "dataframe",\n        },\n    ],\n}\n
    \n
    \n\n\n\n

    This exploratory test will return a dictionary with all specs and the related dataframe.\nYou can access the DataFrame you need by output.get(<spec_id>) for future developments and tests.

    \n\n
    \n
    output = load_data(acon=acon)\ndisplay(output.keys())\ndisplay(output.get("sales_bronze"))\ndisplay(output.get("sales_silver"))\n
    \n
    \n\n

    3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data

    \n\n

    Silver Load Dummy Deliveries

    \n\n

    In this example we will cover the Dummy Deliveries table read and incremental load to silver, composing the silver data to write by using the DataFrame output spec:

    \n\n\n\n
    \n\n
    This example is not a recommendation on how to deal with incremental loads; the ACON was split into three parts for demo purposes.
    \n\n
    \n\n

    Consume bronze data, generate the latest data and return a dictionary with bronze and transformed dataframes:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        },\n        {\n            "spec_id": "dummy_deliveries_silver_source",\n            "read_type": "batch",\n            "data_format": "delta",\n            "db_table": "my_database.dummy_deliveries",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_deliveries_table_max_value",\n            "input_id": "dummy_deliveries_silver_source",\n            "transformers": [\n                {\n                    "function": "get_max_value",\n                    "args": {"input_col": "delivery_date", "output_col": "latest"},\n                },\n                {\n                    "function": "with_expressions",\n                    "args": {\n                        "cols_and_exprs": {"latest": "CASE WHEN latest IS NULL THEN 0 ELSE latest END"},\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "deliveries_bronze",\n            "input_id": "dummy_deliveries_bronze",\n            "data_format": "dataframe",\n        },\n        {\n            "spec_id": "dummy_deliveries_transformed",\n            "input_id": "dummy_deliveries_table_max_value",\n            "data_format": "dataframe",\n        },\n    ],\n}\n\ndummy_deliveries_transformed = load_data(acon=acon)\n\ndummy_deliveries_transformed_df = dummy_deliveries_transformed.get("dummy_deliveries_transformed")\ndummy_deliveries_bronze_df = dummy_deliveries_transformed.get("deliveries_bronze")\n
    \n
    \n\n

    Consume the previous dataframes generated by the first ACON (bronze and latest bronze data) to generate the silver data. In this acon we are using just one output, because we only need its dataframe for the next step.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"delivery_note_header": "delivery_note", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": dummy_deliveries_bronze_df,\n        },\n        {\n            "spec_id": "dummy_deliveries_table_max_value",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": dummy_deliveries_transformed_df,\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_deliveries_transform",\n            "input_id": "dummy_deliveries_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n                {\n                    "function": "incremental_filter",\n                    "args": {\n                        "input_col": "delivery_date",\n                        "increment_df": "dummy_deliveries_table_max_value",\n                        "increment_col": "latest",\n                        "greater_or_equal": False,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_silver",\n            "input_id": "dummy_deliveries_transform",\n            "data_format": "dataframe",\n        }\n    ],\n}\n\ndummy_deliveries_silver = load_data(acon=acon)\ndummy_deliveries_silver_df = dummy_deliveries_silver.get("dummy_deliveries_silver")\n
    \n
    \n\n

    Write the silver data generated by previous ACON into the target

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nwrite_silver_acon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_silver",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": dummy_deliveries_silver_df,\n        },\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dummy_deliveries_quality",\n            "input_id": "dummy_deliveries_silver",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "expectations_store_prefix": "dq/expectations/",\n            "validations_store_prefix": "dq/validations/",\n            "data_docs_prefix": "dq/data_docs/site/",\n            "checkpoint_store_prefix": "dq/checkpoints/",\n            "result_sink_db_table": "my_database.dummy_deliveries_dq",\n            "result_sink_location": "my_data_product_bucket/dq/dummy_deliveries",\n            "fail_on_error": False,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "dq_functions": [\n                {\n                    "function": "expect_column_values_to_not_be_null",\n                    "args": {"column": "delivery_note"},\n                },\n                {\n                    "function": "expect_table_row_count_to_be_between",\n                    "args": {"min_value": 19},\n                },\n                {\n                    "function": "expect_column_max_to_be_between",\n                    "args": {"column": "delivery_item", "min_value": 2},\n                },\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_silver",\n            "input_id": "dummy_deliveries_quality",\n            "write_type": "append",\n            "location": "s3://my_data_product_bucket/silver/dummy_deliveries_df_writer",\n            "data_format": "delta",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=write_silver_acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.write_to_console", "modulename": "lakehouse_engine_usage.data_loader.write_to_console", "kind": "module", "doc": "

    Write to Console

    \n\n

    The console writer is an interesting feature to debug/validate what has been done in the lakehouse engine. Before moving forward and storing data somewhere, it is possible to show/print the final dataframe to the console, which means you can transform the data as many times as you want and display the final result to validate that it is as expected.

    \n\n

    Silver Dummy Sales Write to Console Example

    \n\n

    In this template we will cover the Dummy Sales write to the console. An ACON is used to read from bronze, apply silver transformations and write to the console through the following steps:

    \n\n
    1. Definition of how to read data (input data location, read type and data format);
    2. Transformation of data (rename relevant columns);
    3. Definition of how to print to console (limit, truncate, vertical options);
    \n\n

    For this, the ACON specs are:

    \n\n\n\n
    \n\n
    The console writer is a wrapper for the Spark show() function; if you want to know more about the function itself or the available options, please check the Spark documentation here.
    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_sales_bronze",\n            "read_type": "streaming",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_sales_transform",\n            "input_id": "dummy_sales_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_sales_silver",\n            "input_id": "dummy_sales_transform",\n            "data_format": "console",\n            "options": {"limit": 8, "truncate": False, "vertical": False},\n        }\n    ],\n}\n
    \n
    \n\n

    Then, run the load and exit the notebook: this exploratory test will write to the console, which means the final dataframe will be displayed.

    \n\n
    \n
    load_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_quality", "modulename": "lakehouse_engine_usage.data_quality", "kind": "module", "doc": "

    Data Quality

    \n\n

    The Data Quality framework is based on Great Expectations (GX) and other custom-made \ndevelopments, providing a very light abstraction on top of the GX open source framework and the Spark framework.

    \n\n

    How to use Data Quality?

    \n\n

    Data Loader

    \n\n

    You can define data quality rules inside the DataLoader algorithm that you use to load data.

    \n\n
    \n\n

    The DataLoader algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the \ncapability to store DQ results having history over all the DQ executions, which can be used for debugging, \nto create DQ dashboards on top of the data, and much more.

    \n\n
    \n\n

    Examples: In these examples, dummy sales local data is used to cover a few example usages of the DQ Framework (based on Great Expectations). The main difference between the sample acons is in the usage of dq_specs.

    \n\n\n\n

    Data Quality Validator

    \n\n

    The DQValidator algorithm focuses on validating data (e.g., spark DataFrames, Files or Tables).\nIn contrast to the dq_specs inside the DataLoader algorithm, the DQValidator focuses on validating data at rest \n(post-mortem) instead of validating data in-transit (before it is loaded to the destination).

    \n\n
    \n\n

    The DQValidator algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the\ncapability to store DQ results having history over all the DQ executions, which can be used for debugging,\nto create DQ dashboards on top of the data, and much more.

    \n\n
    \n\n

    Here you can find more information regarding DQValidator and examples.

    \n\n

    Reconciliator

    \n\n

    Similarly to the Data Quality Validator algorithm, the Reconciliator algorithm focuses on \nvalidating data at rest (post-mortem). In contrast to the DQValidator algorithm, the Reconciliator always compares a \ntruth dataset (e.g., spark DataFrames, Files or Tables) with the current dataset (e.g., spark DataFrames, Files or \nTables), instead of executing DQ rules defined by the teams. \nHere you can find more information regarding reconciliator and examples.

    \n\n
    \n\n
    The Reconciliator does not use Great Expectations; therefore, Data Docs, Result Sink and other native features are not available.
    \n\n
    \n\n

    Custom Expectations

    \n\n

    If your data has a data quality check that cannot be done with the expectations provided by Great Expectations, you can create a custom expectation to perform this verification.

    \n\n
    \n\n

    Before creating a custom expectation, check if there is an expectation already created to address your needs, both in Great Expectations and the Lakehouse Engine. Any Custom Expectation that is too specific (using hardcoded table/column names) will be rejected. Expectations should be generic by definition.

    \n\n
    \n\n

    Here you can find more information regarding custom expectations and examples.

    \n\n

    Row Tagging

    \n\n

    The row tagging strategy allows users to tag the rows that failed, making it easier to identify the problems in the validations. Here you can find all the details and examples.

    \n\n

    How to check the results of the Data Quality Process?

    \n\n

    1. Table/location analysis

    \n\n

    The possibility to configure a Result Sink allows you to store the history of executions of the DQ process. \nYou can query the table or the location to search through data and analyse history.
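    As an illustrative sketch, assuming an exploded result sink stored in the table used in the earlier examples (my_database.dummy_deliveries_dq is an assumption for your setup), the history can be inspected with a simple query:

    ```python
    # Inspect the most recent DQ executions stored in the result sink (illustrative table name).
    spark.sql(
        """
        SELECT run_name, run_time, source, expectation_type, expectation_success, observed_value
        FROM my_database.dummy_deliveries_dq
        ORDER BY run_time DESC
        """
    ).show(truncate=False)
    ```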

    \n\n

    2. Power BI Dashboard

    \n\n

    With the information expanded, interactive analysis can be built on top of the history of the DQ process.\nA dashboard can be created with the results that we have in dq_specs. To be able to have this information you \nneed to use arguments result_sink_db_table and/or result_sink_location.

    \n\n

    Through the dashboard, the runs and expectations can be analysed and filtered by year, month, source and run name, and you will have information about the number of runs, some statistics, the status of expectations and more. Analyses such as biggest failures per expectation type, biggest failures by column, biggest failures per source, and others can be made using the information in the result_sink_db_table/result_sink_location.

    \n\n
    \n\n

    The recommendation is to use the same result sink table/location for all your dq_specs and \nin the dashboard you will get a preview of the status of all of them.

    \n\n
    \n\n

    \n\n

    3. Data Docs Website

    \n\n

    An auto-generated site presenting all the relevant information can also be used. If you choose to define the parameter data_docs_bucket, you will be able to store the GX documentation in the defined bucket, and therefore make your data docs available in the DQ Web App (GX UI), visible to everyone. The data_docs_bucket property supersedes the bucket property only for data docs storage.

    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.custom_expectations", "modulename": "lakehouse_engine_usage.data_quality.custom_expectations", "kind": "module", "doc": "

    Custom Expectations

    \n\n

    Defining Custom Expectations

    \n\n

    Custom expectations are defined in python and need to follow a structure to correctly integrate with Great Expectations.

    \n\n

    Follow the documentation of GX on Creating Custom Expectations \nand find information about the existing types of expectations.

    \n\n

    Here is an example of a custom expectation. As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of a custom expectation:

    \n\n
    \n
    """Expectation to check if column 'a' is lower or equal than column 'b'."""\n\nfrom typing import Any, Dict, Optional\n\nfrom great_expectations.core import ExpectationConfiguration\nfrom great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine\nfrom great_expectations.expectations.expectation import ColumnPairMapExpectation\nfrom great_expectations.expectations.metrics.map_metric_provider import (\n    ColumnPairMapMetricProvider,\n    column_pair_condition_partial,\n)\n\nfrom lakehouse_engine.utils.expectations_utils import validate_result\n\n\nclass ColumnPairCustom(ColumnPairMapMetricProvider):\n    """Asserts that column 'A' is lower or equal than column 'B'.\n\n    Additionally, the 'margin' parameter can be used to add a margin to the\n    check between column 'A' and 'B': 'A' <= 'B' + 'margin'.\n    """\n\n    condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"\n    condition_domain_keys = (\n        "batch_id",\n        "table",\n        "column_A",\n        "column_B",\n        "ignore_row_if",\n    )\n    condition_value_keys = ("margin",)\n\n    @column_pair_condition_partial(engine=SparkDFExecutionEngine)\n    def _spark(\n        self: ColumnPairMapMetricProvider,\n        column_A: Any,\n        column_B: Any,\n        margin: Any,\n        **kwargs: dict,\n    ) -> Any:\n        """Implementation of the expectation's logic.\n\n        Args:\n            column_A: Value of the row of column_A.\n            column_B: Value of the row of column_B.\n            margin: margin value to be added to column_b.\n            kwargs: dict with additional parameters.\n\n        Returns:\n            If the condition is met.\n        """\n        if margin is None:\n            approx = 0\n        elif not isinstance(margin, (int, float, complex)):\n            raise TypeError(\n                f"margin must be one of int, float, complex."\n                f" Found: {margin} as {type(margin)}"\n            )\n        else:\n            approx = margin  # type: ignore\n\n        return column_A <= column_B + approx  # type: ignore\n\n\nclass ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):\n    """Expect values in column A to be lower or equal than column B.\n\n    Args:\n        column_A: The first column name.\n        column_B: The second column name.\n        margin: additional approximation to column B value.\n\n    Keyword Args:\n        - allow_cross_type_comparisons: If True, allow\n            comparisons between types (e.g. integer and string).\n            Otherwise, attempting such comparisons will raise an exception.\n        - ignore_row_if: "both_values_are_missing",\n            "either_value_is_missing", "neither" (default).\n        - result_format: Which output mode to use:\n            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.\n        - include_config: If True (default), then include the expectation config\n            as part of the result object.\n        - catch_exceptions: If True, then catch exceptions and\n            include them as part of the result object. 
Default: False.\n        - meta: A JSON-serializable dictionary (nesting allowed)\n            that will be included in the output without modification.\n\n    Returns:\n        An ExpectationSuiteValidationResult.\n    """\n\n    examples = [\n        {\n            "dataset_name": "Test Dataset",\n            "data": [\n                {\n                    "data": {\n                        "a": [11, 22, 50],\n                        "b": [10, 21, 100],\n                        "c": [9, 21, 30],\n                    },\n                    "schemas": {\n                        "spark": {\n                            "a": "IntegerType",\n                            "b": "IntegerType",\n                            "c": "IntegerType",\n                        }\n                    },\n                }\n            ],\n            "tests": [\n                {\n                    "title": "negative_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "c",\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["c"],\n                        },\n                    },\n                    "out": {\n                        "success": False,\n                        "unexpected_index_list": [\n                            {"c": 9, "a": 11},\n                            {"c": 21, "a": 22},\n                            {"c": 30, "a": 50},\n                        ],\n                    },\n                },\n                {\n                    "title": "positive_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "b",\n                        "margin": 1,\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["a"],\n                        },\n                    },\n                    "out": {\n                        "success": True,\n                        "unexpected_index_list": [],\n                    },\n                },\n            ],\n        },\n    ]\n\n    map_metric = "column_pair_values.a_smaller_or_equal_than_b"\n    success_keys = (\n        "column_A",\n        "column_B",\n        "ignore_row_if",\n        "margin",\n        "mostly",\n    )\n    default_kwarg_values = {\n        "mostly": 1.0,\n        "ignore_row_if": "neither",\n        "result_format": "BASIC",\n        "include_config": True,\n        "catch_exceptions": False,\n    }\n\n    def _validate(\n        self,\n        configuration: ExpectationConfiguration,\n        metrics: Dict,\n        runtime_configuration: Optional[dict] = None,\n        execution_engine: Optional[ExecutionEngine] = None,\n    ) -> dict:\n        """Custom implementation of the GE _validate method.\n\n        This method is used on the tests to validate both the result\n        of the tests themselves and if the unexpected index list\n        is correctly generated.\n        The GE test logic does not do this validation, and thus\n        we need to make it manually.\n\n        Args:\n            configuration: Configuration used in the test.\n            metrics: Test result metrics.\n            
runtime_configuration: Configuration used when running the expectation.\n            execution_engine: Execution Engine where the expectation was run.\n\n        Returns:\n            Dictionary with the result of the validation.\n        """\n        return validate_result(self, configuration, metrics)\n\n\n"""Mandatory block of code. If it is removed the expectation will not be available."""\nif __name__ == "__main__":\n    # test the custom expectation with the function `print_diagnostic_checklist()`\n    ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()\n
    \n
    \n\n

    Naming Conventions

    \n\n

    Your expectation's name should start with expect.

    \n\n

    The name of the file must be the name of the expectation written in snake case. Ex: expect_column_length_match_input_length

    \n\n

    The name of the class must be the name of the expectation written in camel case. Ex: ExpectColumnLengthMatchInputLength

    \n\n

    File Structure

    \n\n

    The file contains two main sections:

    \n\n\n\n

    Metric Definition

    \n\n

    In this section we define the logic of the expectation. This needs to follow a certain structure:

    \n\n

    Code Structure

    \n\n

    1) The class you define needs to extend one of the Metric Providers defined by Great Expectations that corresponds \nto your expectation's type. More info on the metric providers.

    \n\n

    2) You need to define the name of your metric. This name must be unique and must follow the following structure: \ntype of expectation.name of metric. Ex.: column_pair_values.a_smaller_or_equal_than_b\nTypes of expectations: column_values, multicolumn_values, column_pair_values, table_rows, table_columns.

    \n\n

    3) Any GX default parameters that are necessary to calculate your metric must be defined as \"condition_domain_keys\".

    \n\n

    4) Any additional parameters that are necessary to calculate your metric must be defined as \"condition_value_keys\".

    \n\n

    5) The logic of your expectation must be defined for the SparkDFExecutionEngine in order to be run on the Lakehouse.

    \n\n
    \n
    1) class ColumnPairCustom(ColumnPairMapMetricProvider):\n    """Asserts that column 'A' is lower or equal than column 'B'."""\n\n    2) condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"\n    3) condition_domain_keys = (\n        "batch_id",\n        "table",\n        "column_A",\n        "column_B",\n        "ignore_row_if",\n    )\n    4) condition_value_keys = ("margin",)\n\n    5) @column_pair_condition_partial(engine=SparkDFExecutionEngine)\n    def _spark(\n        self: ColumnPairMapMetricProvider,\n        column_A: Any,\n        column_B: Any,\n        margin: Any,\n        **kwargs: dict,\n    ) -> Any:\n        """Implementation of the expectation's logic.\n\n        Args:\n            column_A: Value of the row of column_A.\n            column_B: Value of the row of column_B.\n            margin: margin value to be added to column_b.\n            kwargs: dict with additional parameters.\n\n        Returns:\n            If the condition is met.\n        """\n        if margin is None:\n            approx = 0\n        elif not isinstance(margin, (int, float, complex)):\n            raise TypeError(\n                f"margin must be one of int, float, complex."\n                f" Found: {margin} as {type(margin)}"\n            )\n        else:\n            approx = margin  # type: ignore\n\n        return column_A <= column_B + approx  # type: ignore\n
    \n
    \n\n

    Expectation Definition

    \n\n

    In this section we define the expectation. This needs to follow a certain structure:

    \n\n

    Code Structure

    \n\n

    1) The class you define needs to extend one of the Expectations defined by Great Expectations that corresponds to your expectation's type.

    \n\n

    2) You must define an \"examples\" object where you define at least one success and one failure of your expectation to demonstrate its logic. The result format must be set to COMPLETE, and you must set the unexpected_index_column_names variable.

    \n\n
    \n\n

    For any examples where you will have unexpected results you must define unexpected_index_list in your \"out\" element.\nThis will be validated during the testing phase.

    \n\n
    \n\n

    3) The metric must be the same you defined in the metric definition.

    \n\n

    4) You must define all additional parameters that the user has to/should provide to the expectation.

    \n\n

    5) You should define any default values for your expectations parameters.

    \n\n

    6) You must define the _validate method as shown in the example. You must call the validate_result function inside your _validate method; this adds a validation of the unexpected index list defined in the examples.

    \n\n
    \n\n

    If your custom expectation requires any extra validations, or you require additional fields to be returned on the final dataframe, you can add them in this function. The validate_result method has two optional parameters (partial_success and partial_result) that can be used to pass the result of additional validations and to add more information to the result key of the returned dict, respectively.

    \n\n
    \n\n
    \n
    1) class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):\n    """Expect values in column A to be lower or equal than column B.\n\n    Args:\n        column_A: The first column name.\n        column_B: The second column name.\n        margin: additional approximation to column B value.\n\n    Keyword Args:\n        allow_cross_type_comparisons: If True, allow\n            comparisons between types (e.g. integer and string).\n            Otherwise, attempting such comparisons will raise an exception.\n        ignore_row_if: "both_values_are_missing",\n            "either_value_is_missing", "neither" (default).\n        result_format: Which output mode to use:\n            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.\n        include_config: If True (default), then include the expectation config\n            as part of the result object.\n        catch_exceptions: If True, then catch exceptions and\n            include them as part of the result object. Default: False.\n        meta: A JSON-serializable dictionary (nesting allowed)\n            that will be included in the output without modification.\n\n    Returns:\n        An ExpectationSuiteValidationResult.\n    """\n    2) examples = [\n        {\n            "dataset_name": "Test Dataset",\n            "data": {\n                "a": [11, 22, 50],\n                "b": [10, 21, 100],\n                "c": [9, 21, 30],\n            },\n            "schemas": {\n                "spark": {"a": "IntegerType", "b": "IntegerType", "c": "IntegerType"}\n            },\n            "tests": [\n                {\n                    "title": "negative_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "c",\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["c"],\n                            "include_unexpected_rows": True,\n                        },\n                    },\n                    "out": {\n                        "success": False,\n                        "unexpected_index_list": [\n                            {"c": 9, "a": 11},\n                            {"c": 21, "a": 22},\n                            {"c": 30, "a": 50},\n                        ],\n                    },\n                },\n                {\n                    "title": "positive_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "b",\n                        "margin": 1,\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["a"],\n                        },\n                    },\n                    "out": {"success": True},\n                },\n            ],\n        },\n    ]\n\n    3) map_metric = "column_values.pattern_match"\n    4) success_keys = (\n        "validation_regex",\n        "mostly",\n    )\n    5) default_kwarg_values = {\n        "ignore_row_if": "never",\n        "result_format": "BASIC",\n        "include_config": True,\n        "catch_exceptions": False,\n        "mostly": 1,\n    }\n\n    6) def _validate(\n        self,\n        configuration: 
ExpectationConfiguration,\n        metrics: Dict,\n        runtime_configuration: Optional[dict] = None,\n        execution_engine: Optional[ExecutionEngine] = None,\n    ) -> dict:\n        """Custom implementation of the GX _validate method.\n\n        This method is used on the tests to validate both the result\n        of the tests themselves and if the unexpected index list\n        is correctly generated.\n        The GX test logic does not do this validation, and thus\n        we need to make it manually.\n\n        Args:\n            configuration: Configuration used in the test.\n            metrics: Test result metrics.\n            runtime_configuration: Configuration used when running the expectation.\n            execution_engine: Execution Engine where the expectation was run.\n\n        Returns:\n            Dictionary with the result of the validation.\n        """\n        return validate_result(self, configuration, metrics)\n
    \n
    \n\n

    Printing the Expectation Diagnostics

    \n\n

    Your expectations must include the ability to call the Great Expectations diagnostic function in order to be validated.

    \n\n

    In order to do this, the following code must be present.

    \n\n
    \n
    """Mandatory block of code. If it is removed the expectation will not be available."""\nif __name__ == "__main__":\n    # test the custom expectation with the function `print_diagnostic_checklist()`\n    ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()\n
    \n
    \n\n

    Creation Process

    \n\n

    1) Create a branch from lakehouse engine.

    \n\n

    2) Create a custom expectation with your specific logic:

    \n\n
    1. All new expectations must be placed inside folder /lakehouse_engine/dq_processors/custom_expectations.
    2. The name of the expectation must be added to the file /lakehouse_engine/core/definitions.py, to the variable: CUSTOM_EXPECTATION_LIST.
    3. All new expectations must be tested on /tests/feature/custom_expectations/test_custom_expectations.py. In order to create a new test for your custom expectation it is necessary to:
    \n\n\n\n

    3) When the development is completed, create a pull request with your changes.

    \n\n

    4) Your expectation will be available with the next release of the lakehouse engine that happens after your pull request is approved. This means that you need to upgrade your version of the lakehouse engine in order to use it.

    \n\n

    Usage

    \n\n

    Custom Expectations are available to use like any other expectations provided by Great Expectations.
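    As a hedged sketch of what that usage could look like in a dq_spec (the column names and margin value are assumptions, and the expectation is referenced by its snake_case name):

    ```python
    # Illustrative dq_functions entry using the custom expectation from the example above.
    dq_functions = [
        {
            "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
            "args": {"column_A": "net_value", "column_B": "gross_value", "margin": 1},
        },
    ]
    ```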

    \n\n

    Parameters

    \n\n

    Depending on the type of expectation you are defining some parameters are expected by default. \nEx: A ColumnMapExpectation has a default \"column\" parameter.

    \n\n

    Mostly

    \n\n

    Mostly is a standard parameter for a subset of expectations that is used to define a threshold for the failure of an expectation. Ex: A mostly value of 0.7 makes the expectation fail only if fewer than 70% of the records pass the check (i.e., more than 30% of the records have a negative result).
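    As a small illustrative sketch (the expectation and column name are assumptions), mostly is passed alongside the other arguments of the expectation:

    ```python
    # dq_functions entry tolerating up to 30% null values in an illustrative column.
    dq_function = {
        "function": "expect_column_values_to_not_be_null",
        "args": {"column": "delivery_note", "mostly": 0.7},
    }
    ```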

    \n\n

    Result Format

    \n\n

    Great Expectations has several different types of result formats \nfor the expectations results. The lakehouse engine requires the result format to be set to \"COMPLETE\" in order to tag \nthe lines where the expectations failed.

    \n\n

    unexpected_index_column_names

    \n\n

    Inside this key you must define what columns are used as an index inside your data. If this is set and the result \nformat is set to \"COMPLETE\" a list with the indexes of the lines that failed the validation will be returned by \nGreat Expectations.\nThis information is used by the Lakehouse Engine to tag the lines in error after the fact. The additional tests \ninside the _validate method verify that the custom expectation is tagging these lines correctly.

    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.data_quality_validator", "modulename": "lakehouse_engine_usage.data_quality.data_quality_validator", "kind": "module", "doc": "

    Data Quality Validator

    \n\n

    The DQValidator algorithm allows DQ Validations isolated from the data load (it only reads and applies data quality validations). With this algorithm you have the capacity to apply the Lakehouse-Engine Data Quality Process, using Great Expectations functions directly on a specific dataset, while also making use of all the InputSpecs available in the engine.

    \n\n

    Validating the Data Quality using this algorithm is a matter of defining the data you want to read and the validations you want to apply to your data, detailing the Great Expectations functions you want to apply on the data to assess its quality.

    \n\n
    \n\n

    This algorithm also gives the possibility to restore a previous version of a delta table or delta files in case the DQ\nprocess raises any exception. Please use it carefully!! You may lose important commits and data. Moreover, this will\nhighly depend on the frequency that you run your Data Quality validations. If you run your data loads daily and Data\nQuality validations weekly, and you define the restore_prev_version to true, this means that the table will be restored\nto the previous version, but the error could have happened 4 or 5 versions before.

    \n\n
    \n\n

    When to use?

    \n\n\n\n

    This algorithm also gives teams some freedom to:

    \n\n\n\n

    How to use?

    \n\n

    All of these configurations are passed via the ACON to instantiate\na DQValidatorSpec object. The DQValidator algorithm uses an\nACON to configure its execution. In DQValidatorSpec you can\nfind the meaning of each ACON property.

    \n\n

    Here is an example of ACON configuration:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_spec": {\n        "spec_id": "sales_source",\n        "read_type": "batch",\n        "data_format": "table",\n        "db_table": "my_database.my_table"\n    },\n    "dq_spec": {\n        "spec_id": "dq_sales",\n        "input_id": "sales_source",\n        "dq_type": "validator",\n        "store_backend": "file_system",\n        "local_fs_root_dir": "/app/tests/lakehouse/in/feature/dq_validator/dq",\n        "result_sink_db_table": "my_database.dq_validator",\n        "result_sink_format": "json",\n        "fail_on_error": False,\n        "dq_functions": [\n            {"function": "expect_column_to_exist", "args": {"column": "article"}},\n            {\n                "function": "expect_table_row_count_to_be_between",\n                "args": {"min_value": 3, "max_value": 11},\n            },\n        ],\n    },\n    "restore_prev_version": True,\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    On this page you will also find the following examples of usage:

    \n\n
    1. Dataframe as input & Success on the DQ Validation
    2. Table as input & Failure on DQ Validation & Restore previous version
    3. Files as input & Failure on DQ Validation & Fail_on_error disabled
    4. Files as input & Failure on DQ Validation & Critical functions defined
    5. Files as input & Failure on DQ Validation & Max failure percentage defined
    \n\n

    Example 1: Dataframe as input & Success on the DQ Validation

    \n\n

    This example focuses on using a dataframe, computed in this notebook, directly in the input spec. First, a new\nDataFrame is generated as a result of the join of data from two tables (dummy_deliveries and dummy_pd_article) and\nsome DQ Validations are applied on top of this dataframe.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\ninput_df = spark.sql("""\n        SELECT a.*, b.article_category, b.article_color\n        FROM my_database.dummy_deliveries a\n        JOIN my_database.dummy_pd_article b\n            ON a.article_id = b.article_id\n        """\n)\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_article_input",\n        "read_type": "batch",\n        "data_format": "dataframe",\n        "df_name": input_df,\n    },\n    "dq_spec": {\n        "spec_id": "deliveries_article_dq",\n        "input_id": "deliveries_article_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "result_sink_db_table": "my_database.dq_validator_deliveries",\n        "result_sink_location": "my_dq_path/dq_validator/dq_validator_deliveries/",\n        "expectations_store_prefix": "dq/dq_validator/expectations/",\n        "validations_store_prefix": "dq/dq_validator/validations/",\n        "data_docs_prefix": "dq/dq_validator/data_docs/site/",\n        "checkpoint_store_prefix": "dq/dq_validator/checkpoints/",\n        "unexpected_rows_pk": ["salesorder", "delivery_item", "article_id"],\n        "dq_functions": [{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}],\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Example 2: Table as input & Failure on DQ Validation & Restore previous version

    \n\n

    In this example we are using a table as input to validate the data that was loaded. Here, we are forcing the DQ Validations to fail in order to show the possibility of restoring the table to the previous version.

    \n\n
    \n\n

    Be careful when using the feature of restoring a previous version of a delta table or delta files. You may\nlose important commits and data. Moreover, this will highly depend on the frequency that you run your Data Quality\nvalidations. If you run your data loads daily and Data Quality validations weekly, and you define the\nrestore_prev_version to true, this means that the table will be restored to the previous version, but the error\ncould have happened 4 or 5 versions before (because loads are daily, validations are weekly).

    \n\n
    \n\n

    Steps followed in this example to show how the restore_prev_version feature works:

    \n\n
    1. Insert rows into the dummy_deliveries table to adjust the total number of rows and make the DQ process fail.
    2. Use the \"DESCRIBE HISTORY\" statement to check the number of versions available on the table and check the version number resulting from the insertion to the table.
    3. Execute the DQ Validation, using the configured acon (based on reading the dummy_deliveries table and setting the restore_prev_version to true). Checking the logs of the process, you can see that the data did not pass all the expectations defined and that the table version restore process was triggered.
    4. Re-run a \"DESCRIBE HISTORY\" statement to check that the previous version of the table was restored and thus, the row inserted in the beginning of the process is no longer present in the table.
    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\n# Force failure of data quality by adding new row\nspark.sql("""INSERT INTO my_database.dummy_deliveries VALUES (7, 1, 20180601, 71, "article1", "delivered")""")\n\n\n# Check history of the table\nspark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "read_type": "batch",\n        "db_table": "my_database.dummy_deliveries",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}},\n        ],\n    },\n    "restore_prev_version": True,\n}\n\nexecute_dq_validation(acon=acon)\n\n# Check that the previous version of the table was restored\nspark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")\n
    \n
    \n\n

    Example 3: Files as input & Failure on DQ Validation & Fail_on_error disabled

    \n\n

    In this example we are using a location as input to validate the files in a specific folder. Here, we are forcing the DQ Validations to fail while disabling the \"fail_on_error\" configuration, so the algorithm warns about the expectations that failed, but the execution of the algorithm doesn't fail.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "data_format": "delta",\n        "read_type": "streaming",\n        "location": "s3://my_data_product_bucket/silver/dummy_deliveries/",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "fail_on_error": False,\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},\n        ],\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Example 4: Files as input & Failure on DQ Validation & Critical functions defined

    \n\n

    In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the critical functions feature, which will throw an error\nif any of the functions fails.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "data_format": "delta",\n        "read_type": "streaming",\n        "location": "s3://my_data_product_bucket/silver/dummy_deliveries/",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "fail_on_error": True,\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n        ],\n        "critical_functions": [\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},\n        ],\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Example 5: Files as input & Failure on DQ Validation & Max failure percentage defined

    \n\n

    In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the max_percentage_failure,\nwhich will throw an error if the percentage of failures surpasses the defined maximum threshold.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "data_format": "delta",\n        "read_type": "streaming",\n        "location": "s3://my_data_product_bucket/silver/dummy_deliveries/",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "fail_on_error": True,\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},\n        ],\n        "max_percentage_failure": 0.2,\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Limitations

    \n\n

    Unlike the DataLoader, this new DQValidator algorithm only allows, for now, one input_spec (instead of a list of input_specs) and one dq_spec (instead of a list of dq_specs). There are plans and efforts already initiated to support lists of input_specs and dq_specs. However, you can prepare a Dataframe which joins more than one source and use it as input, in case you need to assess the Data Quality from different sources at the same time. Alternatively, you can also show interest in any enhancement of this feature, as well as contribute yourself.

    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.minimal_example", "modulename": "lakehouse_engine_usage.data_quality.minimal_example", "kind": "module", "doc": "

    Minimal Example

    \n\n

    This scenario illustrates the minimal configuration that you can have to use dq_specs, in which it uses the required parameters: spec_id, input_id, dq_type, bucket, dq_functions, and the optional parameter data_docs_bucket. This parameter allows you to store the GX documentation in another bucket that can be used to make your data docs available in the DQ Web App (GX UI) without giving users access to your bucket. The data_docs_bucket property supersedes the bucket property only for data docs storage.

    \n\n

    Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check:

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.result_sink", "modulename": "lakehouse_engine_usage.data_quality.result_sink", "kind": "module", "doc": "

    Result Sink

    \n\n

    These scenarios store the results of the dq_specs into a result sink. For that, both scenarios include parameters defining the specific table and location (result_sink_db_table and result_sink_location) where the results are expected to be stored. With this configuration, people can later check the history of the DQ executions using the configured table/location, as shown below. You can configure saving the output of the results in the result sink following two approaches:

    The exploded result sink (approach 1) stores one row per expectation, for example:

    | ... | source | column | max_value | min_value | expectation_type | expectation_success | observed_value | run_time_year | ... |
    | all columns from raw + more | deliveries | salesorder | null | null | expect_column_to_exist | TRUE | null | 2023 | ... |
    | all columns from raw + more | deliveries | null | null | null | expect_table_row_count_to_be_between | TRUE | 23 | 2023 | ... |
    | all columns from raw + more | deliveries | null | null | null | expect_table_column_count_to_be_between | TRUE | 6 | 2023 | ... |

    The raw result sink (approach 2) stores one row per run, for example:

    | checkpoint_config | run_name | run_time | run_results | success | validation_result_identifier | spec_id | input_id |
    | entire configuration | 20230323-...-dq_validation | 2023-03-23T15:11:32.225354+00:00 | results of the 3 expectations | true/false for the run | identifier | spec_id | input_id |
    \n\n
    \n\n
    • More configurations can be applied in the result sink, such as the file format and partitions.
    • It is recommended to:
      • Use the same result sink table/location for all dq_specs across different data loads, from different sources, in the same Data Product.
      • Use the parameter source (only available with \"result_sink_explode\": True) in the dq_specs, as used in both scenarios, with the name of the data source, to make it easier to distinguish sources in the analysis. If not specified, the input_id of the dq_spec will be considered as the source.
      • These recommendations enable richer analysis/dashboards at the Data Product level, considering all the different sources and data loads that the Data Product has.
    \n\n
    \n\n

    1. Result Sink Exploded (Recommended)

    \n\n

    This scenario stores the DQ Results (results produced by the execution of the dq_specs) in the Result Sink in a detailed format, in which people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments. This is the recommended approach, since it makes the analysis on top of the result sink much easier and faster.

    \n\n

    To achieve the exploded data model, this scenario introduces the parameter result_sink_explode, which is a flag to determine if the output table/location should have the columns exploded (True) or not (False). The default is True, but it is still provided explicitly in this scenario for demo purposes. The table/location will include a schema containing general columns, statistic columns, arguments of expectations, and others; part of the schema will always be filled with values, while the other part will depend on the expectations chosen.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "result_sink_explode": True,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_success",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    To check the history of the DQ results, you can run commands like:

    \n\n\n\n
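    A minimal sketch of such a command, assuming a Databricks-style display function and the result sink table/location configured in the example above (the exact columns available vary with the expectations used):

    from pyspark.sql import functions as F

    # Minimal sketch: query the exploded result sink, one row per expectation per run.
    result_sink = spark.table("my_database.dq_result_sink")
    display(
        result_sink.filter(F.col("source") == "deliveries_success")
        .select("source", "run_time", "expectation_type", "expectation_success", "observed_value")
        .orderBy(F.col("run_time").desc())
    )

    # Alternatively, read straight from the configured location:
    # display(spark.read.format("delta").load("my_dq_path/dq_result_sink/"))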

    2. Raw Result Sink

    \n\n

    This scenario is very similar to the previous one, but it changes the parameter result_sink_explode to False so that it produces a raw result sink output containing only one row representing the full run of the dq_specs (regardless of the number of expectations/dq_functions defined there). Being a raw output, it is not a recommended approach, as it is more complicated to analyse and query.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink_raw",\n            "result_sink_location": "my_dq_path/dq_result_sink_raw/",\n            "result_sink_explode": False,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_success_raw",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    To check the history of the DQ results, you can run commands like:
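    As a sketch, assuming the raw sink table configured in the example above (one row per dq_specs run) and a Databricks-style display function:

    # Minimal sketch: inspect the raw result sink, one row per dq_specs run.
    display(
        spark.table("my_database.dq_result_sink_raw")
        .select("run_name", "run_time", "success")
        .orderBy("run_time", ascending=False)
    )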

    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_quality.row_tagging", "modulename": "lakehouse_engine_usage.data_quality.row_tagging", "kind": "module", "doc": "

    Row Tagging

    \n\n

    Data quality is essential for any organisation that relies on data to make informed decisions. \nHigh-quality data provides accurate, reliable, and timely information that enables organisations to identify\nopportunities, mitigate risks, and optimize their operations. In contrast, low-quality data can lead to incorrect\nconclusions, faulty decisions, and wasted resources.

    \n\n

    There are several common issues that can compromise data quality, such as:

    \n\n\n\n

    Therefore, implementing data quality controls, such as data validation rules, and regularly monitoring data for \naccuracy and completeness is key for any organisation.

    \n\n

    One of the controls that can be applied is the DQ Row Tagging Strategy: you not only apply validations on your data to ensure Data Quality, but you also tag your data with the results of the Data Quality validations, providing advantages like:

    \n\n\n\n
    \n\n

    When using the DQ Row Tagging approach, data availability takes precedence over Data Quality, meaning that all the data will be introduced into the final target (e.g. table or location) regardless of the Data Quality issues it is having.

    \n\n
    \n\n

    Different Types of Expectations:

    \n\n\n\n

    The expectations highlighted as row level are the ones that enable tagging failures on specific rows and adding the details about each failure (they affect the field run_row_result inside dq_validations). The expectations with other levels (not row level) influence the overall result of the Data Quality execution, but won't be used to tag specific rows (they affect the field run_success only, so you can even have situations in which you get run_success False and run_row_success True for all rows).

    \n\n

    How does the Strategy work?

    \n\n

    The strategy relies mostly on the 6 arguments below.

    \n\n
    \n\n

    When you specify \"tag_source_data\": True, the arguments fail_on_error, gx_result_format and \nresult_sink_explode are set to the expected values.

    \n\n
    \n\n\n\n
    \n\n

    It only works if result_sink_explode is True, result_format is COMPLETE and fail_on_error is False.

    \n\n
    \n\n\n\n
    \n\n

    It is mandatory to provide one of the arguments (unexpected_rows_pk or tbl_to_derive_pk) when using \ntag_source_data as True. \nWhen tag_source_data is False, this is not mandatory, but still recommended.

    \n\n
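    For illustration, a dq_spec fragment showing the two alternatives (use only one of them; the column names in option 2 are illustrative, not prescribed by the engine):

    # Sketch of the two alternative ways to provide a primary key for row tagging.
    dq_spec_pk_options = {
        # Option 1: derive the primary key from an existing table (as used in the examples on this page).
        "tbl_to_derive_pk": "my_database.dummy_deliveries",
        # Option 2 (alternative, illustrative column names): pass the primary key columns explicitly.
        # "unexpected_rows_pk": ["salesorder", "delivery_item"],
    }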
    \n\n

    \n\n
    \n\n

    The tagging strategy only works when tag_source_data is True, which automatically\nassigns the expected values for the parameters result_sink_explode (True), fail_on_error (False)\nand gx_result_format (\"COMPLETE\").

    \n\n
    \n\n
    \n\n

    For the DQ Row Tagging to work, in addition to configuring the aforementioned arguments in the dq_specs, you will also need to add the dq_validations field to your table (in your DDL statements, recommended) or enable schema evolution.

    \n\n
    \n\n
    \n\n

    The kwargs field is a string, because it can assume different schemas for different expectations and runs. It is useful to provide the complete picture of the row level failure and to allow filtering/joining with the result sink table, when there is one (a small parsing sketch follows the examples). Some examples of kwargs below:

    \n\n
    • {\"column\": \"country\", \"min_value\": 1, \"max_value\": 2, \"batch_id\": \"o723491yyr507ho4nf3\"} → example for expectations starting with expect_column_values (they always make use of \"column\", the other arguments vary).
    • {\"column_A\": \"country\", \"column_B\": \"city\", \"batch_id\": \"o723491yyr507ho4nf3\"} → example for expectations starting with expect_column_pair (they make use of \"column_A\" and \"column_B\", the other arguments vary).
    • {\"column_list\": [\"col1\", \"col2\", \"col3\"], \"batch_id\": \"o723491yyr507ho4nf3\"} → example for expectations starting with expect_multicolumn (they make use of \"column_list\", the other arguments vary). batch_id is common to all expectations, and it is an identifier for the batch of data being validated by Great Expectations.
    \n\n
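    The parsing sketch referenced above: this is not the engine's API, just plain PySpark applied to a string column shaped like the kwargs examples, to extract arguments for filtering/joining:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Illustrative dataframe with a kwargs string shaped like the examples above.
    failed_rows = spark.createDataFrame(
        [('{"column": "country", "min_value": 1, "max_value": 2, "batch_id": "o723491yyr507ho4nf3"}',)],
        ["kwargs"],
    )

    # Extract individual arguments from the kwargs JSON string.
    failed_rows.select(
        F.get_json_object("kwargs", "$.column").alias("column"),
        F.get_json_object("kwargs", "$.batch_id").alias("batch_id"),
    ).show(truncate=False)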
    \n\n

    Example

    \n\n

    This scenario uses the row tagging strategy, which allows users to tag the rows that failed, making it easier to identify the problems in the validations.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "tag_source_data": True,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_tag",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {\n                    "function": "expect_column_values_to_be_in_set",\n                    "args": {"column": "salesorder", "value_set": ["37"]},\n                },\n                {\n                    "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",\n                    "args": {"column_A": "salesorder", "column_B": "delivery_item"},\n                },\n                {\n                    "function": "expect_multicolumn_sum_to_equal",\n                    "args": {"column_list": ["salesorder", "delivery_item"], "sum_total": 100},\n                },\n            ],\n            "critical_functions": [\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 6}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    Running the cell below shows the new column, named dq_validations, with information about the DQ validations.\ndisplay(spark.read.format(\"delta\").load(\"s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/\"))

    \n\n

    Performance and Limitations Trade-offs

    \n\n

    When using the DQ Row Tagging Strategy, by default we are using Great Expectations Result Format \"Complete\" with \nUnexpected Index Column Names (a primary key for the failures), meaning that for each failure, we are getting all \nthe distinct values for the primary key. After getting all the failures, we are applying some needed transformations \nand joining them with the source data, so that it can be tagged by filling the \"dq_validations\" column.

    \n\n

    Hence, this can definitely be a heavy and time-consuming operation on your data loads. To reduce this disadvantage you can cache the dataframe by passing \"cache_df\": True in your DQ Specs. In addition, keep in mind that each expectation (dq_function) that you add to your DQ Specs adds time to your data loads, so always balance performance against the number of validations that you need.
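    As a sketch, enabling caching only requires the extra key in the DQ Spec (fragment based on the examples on this page):

    # dq_spec fragment: cache the input dataframe before running the expectations and the tagging join.
    dq_spec_fragment = {
        "spec_id": "dq_validator",
        "input_id": "dummy_deliveries_source",
        "dq_type": "validator",
        "tag_source_data": True,
        "tbl_to_derive_pk": "my_database.dummy_deliveries",
        "cache_df": True,  # avoids recomputing the source dataframe during the tagging join
        "dq_functions": [
            {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
        ],
    }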

    \n\n

    Moreover, Great Expectations is currently relying on the driver node to capture the results of the execution and \nreturn/store them. Thus, in case you have huge amounts of rows failing (let's say 500k or more) Great Expectations \nmight raise exceptions.

    \n\n

    In these situations, the data load will still happen and the data will still be tagged with the Data Quality validations information; however, you won't have the complete picture of the failures, and the raised_exceptions field is filled as True, so that you can easily notice it and debug it.

    \n\n

    Most of the time, if you have such an amount of rows failing, it will probably mean that you did something wrong and want to fix it as soon as possible (you do not really care about tagging specific rows, because you will not want your consumers to be consuming a million defective rows). However, if you still want to try to make it pass, you can try to increase your driver size and play with some spark configurations like:

    \n\n\n\n
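    The original list of configurations is not preserved here; as an illustration, two driver-related Spark settings that are commonly increased in this situation (they must be set at cluster/session creation time):

    from pyspark.sql import SparkSession

    # Illustrative only: give the driver more memory and a larger cap for results collected to it.
    spark = (
        SparkSession.builder
        .config("spark.driver.memory", "16g")
        .config("spark.driver.maxResultSize", "8g")
        .getOrCreate()
    )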

    For debugging purposes, you can also use a different Great Expectations Result Format like \"SUMMARY\" (adding \"gx_result_format\": \"SUMMARY\" in your DQ Spec), so that you get only a partial list of the failures, avoiding exceeding the driver capacity.

    \n\n
    \n\n

    When using a Result Format different from the default (\"COMPLETE\"), the flag \"tag_source_data\" will be \noverwritten to False, as the results of the tagging wouldn't be complete which could lead to erroneous \nconclusions from stakeholders (but you can always get the details about the result of the DQ execution in\nthe result_sink_location or result_sink_db_table that you have configured).

    \n\n
    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.validations_failing", "modulename": "lakehouse_engine_usage.data_quality.validations_failing", "kind": "module", "doc": "

    Validations Failing

    \n\n

    The scenarios presented on this page are similar, but their goal is to show what happens when a DQ expectation fails the validations.\nThe logs generated by the execution of the code will contain information regarding which expectation(s) have failed and why.

    \n\n

    1. Fail on Error

    \n\n

    In this scenario, the two parameters below are specified:

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_fail",\n            "fail_on_error": False,\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 20}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},\n                {"function": "expect_column_values_to_be_null", "args": {"column": "article"}},\n                {"function": "expect_column_values_to_be_unique", "args": {"column": "status"}},\n                {\n                    "function": "expect_column_min_to_be_between",\n                    "args": {"column": "delivery_item", "min_value": 1, "max_value": 15},\n                },\n                {\n                    "function": "expect_column_max_to_be_between",\n                    "args": {"column": "delivery_item", "min_value": 15, "max_value": 30},\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    If you run the command below, you will be able to see that the success column has the value false for the last execution.\ndisplay(spark.table(\"my_database.dq_result_sink\"))

    \n\n

    2. Critical Functions

    \n\n

    In this scenario, alternative parameters to fail_on_error are used:

    \n\n\n\n

    Additionally, further parameters can also be defined, such as:

    \n\n\n\n

    You can also pair critical_functions with max_percentage_failure, by defining something like a 0.6 max percentage of failure and also defining some critical functions. In this case, even if the threshold is respected, the list defined in critical_functions is still checked (see the sketch below).
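    A sketch of that pairing inside a dq_spec (fragment only, values illustrative):

    dq_spec_fragment = {
        "max_percentage_failure": 0.6,  # tolerate up to 60% of failed expectations...
        "critical_functions": [         # ...but these are checked regardless of the threshold
            {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},
        ],
    }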

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "source": "deliveries_critical",\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n            ],\n            "critical_functions": [\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.reconciliator", "modulename": "lakehouse_engine_usage.reconciliator", "kind": "module", "doc": "

    Reconciliator

    \n\n

    Checking if data reconciles, using this algorithm, is a matter of reading the truth data and the current data.\nYou can use any input specification compatible with the lakehouse engine to read truth or current data. On top\nof that, you can pass a truth_preprocess_query and a current_preprocess_query so you can preprocess the data before\nit goes into the actual reconciliation process. The reconciliation process is focused on joining truth\nwith current by all provided columns except the ones passed as metrics.

    \n\n

    In the table below, we present what a simple reconciliation would look like:

    | current_country | current_count | truth_country | truth_count | absolute_diff | perc_diff | yellow | red | recon_type |
    | Sweden | 123 | Sweden | 120 | 3 | 0.025 | 0.1 | 0.2 | percentage |
    | Germany | 2946 | Sweden | 2946 | 0 | 0 | 0.1 | 0.2 | percentage |
    | France | 2901 | France | 2901 | 0 | 0 | 0.1 | 0.2 | percentage |
    | Belgium | 426 | Belgium | 425 | 1 | 0.002 | 0.1 | 0.2 | percentage |
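    From the sample rows, perc_diff appears to be the absolute difference divided by the truth count, compared against the yellow/red thresholds; a quick sanity check under that assumption (not the engine's actual code):

    # Sanity check of the first sample row (assumed formula: perc_diff = absolute_diff / truth_count).
    current_count, truth_count = 123, 120
    absolute_diff = abs(current_count - truth_count)  # 3
    perc_diff = absolute_diff / truth_count           # 0.025
    yellow, red = 0.1, 0.2
    flag = "red" if perc_diff >= red else "yellow" if perc_diff >= yellow else "ok"
    print(absolute_diff, round(perc_diff, 3), flag)   # 3 0.025 ok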
    \n\n

    The Reconciliator algorithm uses an ACON to configure its execution. You can find the meaning of each ACON property in the ReconciliatorSpec object.

    \n\n

    Below is an example of the usage of the reconciliator.

    \n\n
    \n
    from lakehouse_engine.engine import execute_reconciliation\n\ntruth_query = """\n  SELECT\n    shipping_city,\n    sum(sales_order_qty) as qty,\n    order_date_header\n  FROM (\n    SELECT\n      ROW_NUMBER() OVER (\n        PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city\n        ORDER BY changed_on desc\n      ) as rank1,\n      sales_order_header,\n      sales_order_item,\n      sales_order_qty,\n      order_date_header,\n      shipping_city\n    FROM truth -- truth is a locally accessible temp view created by the lakehouse engine\n    WHERE order_date_header = '2021-10-01'\n  ) a\nWHERE a.rank1 = 1\nGROUP BY a.shipping_city, a.order_date_header\n"""\n\ncurrent_query = """\n  SELECT\n    shipping_city,\n    sum(sales_order_qty) as qty,\n    order_date_header\n  FROM (\n    SELECT\n      ROW_NUMBER() OVER (\n        PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city\n        ORDER BY changed_on desc\n      ) as rank1,\n      sales_order_header,\n      sales_order_item,\n      sales_order_qty,\n      order_date_header,\n      shipping_city\n    FROM current -- current is a locally accessible temp view created by the lakehouse engine\n    WHERE order_date_header = '2021-10-01'\n  ) a\nWHERE a.rank1 = 1\nGROUP BY a.shipping_city, a.order_date_header\n"""\n\nacon = {\n    "metrics": [{"metric": "qty", "type": "percentage", "aggregation": "avg", "yellow": 0.05, "red": 0.1}],\n    "truth_input_spec": {\n        "spec_id": "truth",\n        "read_type": "batch",\n        "data_format": "csv",\n        "schema_path": "s3://my_data_product_bucket/artefacts/metadata/schemas/bronze/orders.json",\n        "options": {\n            "delimiter": "^",\n            "dateFormat": "yyyyMMdd",\n        },\n        "location": "s3://my_data_product_bucket/bronze/orders",\n    },\n    "truth_preprocess_query": truth_query,\n    "current_input_spec": {\n        "spec_id": "current",\n        "read_type": "batch",\n        "data_format": "delta",\n        "db_table": "my_database.orders",\n    },\n    "current_preprocess_query": current_query,\n}\n\nexecute_reconciliation(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor", "modulename": "lakehouse_engine_usage.sensor", "kind": "module", "doc": "

    Sensor

    \n\n

    What is it?

    \n\n

    The lakehouse engine sensors are an abstraction to otherwise complex spark code that can be executed in very small\nsingle-node clusters to check if an upstream system or data product contains new data since the last execution of our\njob. With this feature, we can trigger a job to run in more frequent intervals and if the upstream does not contain new\ndata, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction,\nTransformation, and Loading).

    \n\n

    How do Sensor-based jobs work?

    \n\n

    \"image\"

    \n\n

    With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source system) has new data since the last successful job. We accomplish this through the approach illustrated above, which can be interpreted as follows:

    \n\n
    1. A Data Product can check if Kafka, JDBC or any other source supported by the Lakehouse Engine Sensors contains new data, using the respective sensors;
    2. The Sensor task may run in a very tiny single-node cluster to ensure cost efficiency (check sensor cost efficiency);
    3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster to process all the ETL tasks (data processing tasks);
    4. In the same way, a different Data Product can sense if an upstream Data Product has new data by using 1 of 2 options:
        1. (Preferred) Sense the upstream Data Product sensor control delta table;
        2. Sense the upstream Data Product data files in s3 (files sensor) or any of their delta tables (delta table sensor).
    \n\n

    The Structure and Relevance of the Data Product\u2019s Sensors Control Table

    \n\n

    The concept of a lakehouse engine sensor is based on a special delta table stored inside the data product that chooses\nto opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that\ndata product. You can refer to the below table to understand the sensor delta table structure:

    • sensor_id (STRING): A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream. Each sensor in each job should have a different sensor_id. If you attempt to create 2 sensors with the same sensor_id, the engine will fail.
    • assets (ARRAY<STRING>): A list of assets (e.g., tables or dataset folders) that are considered as available to consume downstream after the sensor has status PROCESSED_NEW_DATA.
    • status (STRING): Status of the sensor. Can either be:
      • ACQUIRED_NEW_DATA – when the sensor in a job has recognised that there is new data from the upstream, but the job where the sensor is was still not successfully executed.
      • PROCESSED_NEW_DATA – when the job where the sensor is located has processed all the tasks in that job.
    • status_change_timestamp (STRING): Timestamp when the status has changed for the last time.
    • checkpoint_location (STRING): Base location of the Spark streaming checkpoint, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g. Spark streaming checkpoints are used for Kafka, Delta and File sensors.
    • upstream_key (STRING): Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically). This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database).
    • upstream_value (STRING): Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key. This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and is therefore useful for recognising that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database).
    \n\n

    Note: to make use of the sensors you will need to add this table to your data product.
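    A minimal sketch of creating such a control table with the columns listed above (the table name matches the examples on this page; the location is illustrative and assumes an active Spark session):

    spark.sql("""
        CREATE TABLE IF NOT EXISTS my_database.lakehouse_engine_sensors (
            sensor_id STRING,
            assets ARRAY<STRING>,
            status STRING,
            status_change_timestamp STRING,
            checkpoint_location STRING,
            upstream_key STRING,
            upstream_value STRING
        )
        USING DELTA
        LOCATION 's3://my_data_product_bucket/lakehouse_engine_sensors/'
    """)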

    \n\n

    How is it different from scheduled jobs?

    \n\n

    Sensor-based jobs are still scheduled, but they can be scheduled with higher frequency, as they are more cost-efficient\nthan ramping up a multi-node cluster supposed to do heavy ETL, only to figure out that the upstream does not have new\ndata.

    \n\n

    Are sensor-based jobs cost-efficient?

    \n\n

    For the same schedule (e.g., 4 times a day), sensor-based jobs are more cost-efficient than scheduling a regular job, because with sensor-based jobs you can start a very tiny single-node cluster, and only if there is new data in the upstream is the bigger ETL cluster spun up. For this reason, they are considered more cost-efficient.\nMoreover, if you have very hard SLAs to comply with, you can also play with alternative architectures where you have several sensors in a continuous (always running) cluster, which then keeps triggering the respective data processing jobs whenever there is new data.

    \n\n

    Sensor Steps

    \n\n
    1. Create your sensor task for the upstream source. Examples of available sources: Delta Table, another Sensor Delta Table, Files, JDBC, Kafka and SAP BW/B4 (see the sections below);
    2. Setup/Execute your ETL task based on the Sensor Condition;
    3. Update the Sensor Control table status with the Update Sensor Status.
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.delta_table", "modulename": "lakehouse_engine_usage.sensor.delta_table", "kind": "module", "doc": "

    Sensor from Delta Table

    \n\n

    This shows how to create a Sensor to detect new data from a Delta Table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named sensor_new_data.
    \n\n

    If you want to view some examples of usage you can visit the delta upstream sensor table or the jdbc sensor.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. \n
    3. The fail_on_empty_result=False.
    4. \n
    \n\n

    Data will be consumed from a delta table in streaming mode, so if there is any new data it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "delta",\n        "db_table": "upstream_database.source_delta_table",\n        "options": {\n            "readChangeFeed": "true", # to read changes in upstream table\n        },\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    With fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
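    As an illustration (not part of the original example), the returned boolean could drive the next step like this; run_etl_job is a hypothetical placeholder, not a lakehouse-engine function:

    if acquired_data:
        run_etl_job()  # hypothetical placeholder for triggering your heavier ETL task/job
    else:
        print("No new data upstream; skipping the ETL step.")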
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.delta_upstream_sensor_table", "modulename": "lakehouse_engine_usage.sensor.delta_upstream_sensor_table", "kind": "module", "doc": "

    Sensor from other Sensor Delta Table

    \n\n

    This shows how to create a Sensor to detect new data from another Sensor Delta Table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named sensor_new_data.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. \n
    3. The fail_on_empty_result=False.
    4. \n
    \n\n

    It makes use of generate_sensor_query to generate the preprocess_query, unlike the delta_table sensor.

    \n\n

    Data from the other sensor delta table will be consumed in streaming mode. If there is any new data, it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor, generate_sensor_query\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "delta",\n        "db_table": "upstream_database.lakehouse_engine_sensors",\n        "options": {\n            "readChangeFeed": "true",\n        },\n    },\n    "preprocess_query": generate_sensor_query("UPSTREAM_SENSOR_ID"),\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    With fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.file", "modulename": "lakehouse_engine_usage.sensor.file", "kind": "module", "doc": "

    Sensor from Files

    \n\n

    This shows how to create a Sensor to detect new data from a File Location.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named sensor_new_data.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. \n
    3. The fail_on_empty_result=False.
    4. \n
    \n\n

    Using these sensors and consuming the data in streaming mode, any new file added to the file location will automatically trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "csv",  # You can use any of the data formats supported by the lakehouse engine, e.g: "avro|json|parquet|csv|delta|cloudfiles"\n        "location": "s3://my_data_product_bucket/path",\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    With fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.jdbc_table", "modulename": "lakehouse_engine_usage.sensor.jdbc_table", "kind": "module", "doc": "

    Sensor from JDBC

    \n\n

    This shows how to create a Sensor to detect new data from a JDBC table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. Generic JDBC template with fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. \n
    3. Generic JDBC template with fail_on_empty_result=False.
    4. \n
    \n\n

    Data from JDBC will be consumed in batch mode. If there is new data, based on the preprocess query applied to the source table, it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor, generate_sensor_query\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "batch",\n        "data_format": "jdbc",\n        "jdbc_args": {\n            "url": "JDBC_URL",\n            "table": "JDBC_DB_TABLE",\n            "properties": {\n                "user": "JDBC_USERNAME",\n                "password": "JDBC_PWD",\n                "driver": "JDBC_DRIVER",\n            },\n        },\n        "options": {\n            "compress": True,\n        },\n    },\n    "preprocess_query": generate_sensor_query(\n        sensor_id="MY_SENSOR_ID",\n        filter_exp="?upstream_key > '?upstream_value'",\n        control_db_table_name="my_database.lakehouse_engine_sensors",\n        upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",\n    ),\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    With fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.kafka", "modulename": "lakehouse_engine_usage.sensor.kafka", "kind": "module", "doc": "

    Sensor from Kafka

    \n\n

    This shows how to create a Sensor to detect new data from Kafka.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named sensor_new_data.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. \n
    3. The fail_on_empty_result=False.
    4. \n
    \n\n

    Data from Kafka will be consumed in streaming mode, so if there is any new data in the kafka topic it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "kafka",\n        "options": {\n            "kafka.bootstrap.servers": "KAFKA_SERVER",\n            "subscribe": "KAFKA_TOPIC",\n            "startingOffsets": "earliest",\n            "kafka.security.protocol": "SSL",\n            "kafka.ssl.truststore.location": "TRUSTSTORE_LOCATION",\n            "kafka.ssl.truststore.password": "TRUSTSTORE_PWD",\n            "kafka.ssl.keystore.location": "KEYSTORE_LOCATION",\n            "kafka.ssl.keystore.password": "KEYSTORE_PWD",\n        },\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    With fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.sap_bw_b4", "modulename": "lakehouse_engine_usage.sensor.sap_bw_b4", "kind": "module", "doc": "

    Sensor from SAP

    \n\n

    This shows how to create a Sensor to detect new data from a SAP LOGCHAIN table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named sensor_new_data.

    \n\n
    \n\n\n\n

    Specific configuration is required to have a Sensor consuming a SAP BW/B4 upstream. The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream: generate_sensor_sap_logchain_query and generate_sensor_query.

    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. \n
    3. The fail_on_empty_result=False.
    4. \n
    \n\n

    Data from SAP will be consumed in batch mode, so if there is any new data in the SAP LOGCHAIN table it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor, generate_sensor_query, generate_sensor_sap_logchain_query\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "batch",\n        "data_format": "jdbc",\n        "options": {\n            "compress": True,\n            "driver": "JDBC_DRIVER",\n            "url": "JDBC_URL",\n            "user": "JDBC_USERNAME",\n            "password": "JDBC_PWD",\n            "prepareQuery": generate_sensor_sap_logchain_query(chain_id="CHAIN_ID", dbtable="JDBC_DB_TABLE"),\n            "query": generate_sensor_query(\n                sensor_id="MY_SENSOR_ID",\n                filter_exp="?upstream_key > '?upstream_value'",\n                control_db_table_name="my_database.lakehouse_engine_sensors",\n                upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",\n            ),\n        },\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    With fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.update_sensor_status", "modulename": "lakehouse_engine_usage.sensor.update_sensor_status", "kind": "module", "doc": "

    Update Sensor control delta table after processing the data

    \n\n

    This shows how to update the status of your Sensor after processing the new data.

    \n\n

    Here is an example on how to update the status of your sensor in the Sensors Control Table:

    \n\n
    \n
    from lakehouse_engine.engine import update_sensor_status\n\nupdate_sensor_status(\n    sensor_id="MY_SENSOR_ID",\n    control_db_table_name="my_database.lakehouse_engine_sensors",\n    status="PROCESSED_NEW_DATA",\n    assets=["MY_SENSOR_ASSETS"]\n)\n
    \n
    \n\n

    If you want to know more please visit the definition of the class here.

    \n"}]; + /** pdoc search index */const docs = [{"fullname": "lakehouse_engine", "modulename": "lakehouse_engine", "kind": "module", "doc": "

    Lakehouse engine package containing all the system subpackages.

    \n\n\n"}, {"fullname": "lakehouse_engine.algorithms", "modulename": "lakehouse_engine.algorithms", "kind": "module", "doc": "

    Package containing all the lakehouse engine algorithms.

    \n"}, {"fullname": "lakehouse_engine.algorithms.algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "kind": "module", "doc": "

    Module containing the Algorithm class.

    \n"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm", "kind": "class", "doc": "

    Class to define the behavior of every algorithm based on ACONs.

    \n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.__init__", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.__init__", "kind": "function", "doc": "

    Construct Algorithm instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.get_dq_spec", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.get_dq_spec", "kind": "function", "doc": "

    Get data quality specification object from acon.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The DQSpec and the List of DQ Functions Specs.

    \n
    \n", "signature": "(\tcls,\tspec: dict) -> Tuple[lakehouse_engine.core.definitions.DQSpec, List[lakehouse_engine.core.definitions.DQFunctionSpec], List[lakehouse_engine.core.definitions.DQFunctionSpec]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader", "modulename": "lakehouse_engine.algorithms.data_loader", "kind": "module", "doc": "

    Module to define DataLoader class.

    \n"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader", "kind": "class", "doc": "

    Load data using an algorithm configuration (ACON represented as dict).

    \n\n

    This algorithm focuses on the cases where users will be specifying all the algorithm\nsteps and configurations through a dict based configuration, which we name ACON\nin our framework.

    \n\n

    Since an ACON is a dict you can pass a custom transformer through a python function\nand, therefore, the DataLoader can also be used to load data with custom\ntransformations not provided in our transformers package.

    \n\n

    As the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.

    \n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.__init__", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.__init__", "kind": "function", "doc": "

    Construct DataLoader algorithm instances.

    \n\n

    A data loader needs several specifications to work properly,\nbut some of them might be optional. The available specifications are:

    \n\n\n\n
    Arguments:
    \n\n\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.read", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.read", "kind": "function", "doc": "

    Read data from an input location into a distributed dataframe.

    \n\n
    Returns:
    \n\n
    \n

    An ordered dict with all the dataframes that were read.

    \n
    \n", "signature": "(self) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.transform", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.transform", "kind": "function", "doc": "

    Transform (optionally) the data that was read.

    \n\n

    If there isn't a transformation specification, this step will be skipped, and the original dataframes that were read will be returned.\nTransformations can depend on the result of another transformation; however, keep in mind that if we are using a streaming source and for some reason we need to enable micro batch processing, that result cannot be used as input to another transformation. Micro batch processing in pyspark streaming is only available in .write(), which means the transformation with micro batch needs to be the end of the process.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Another ordered dict with the transformed dataframes, according to the\n transformation specification.

    \n
    \n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.process_dq", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.process_dq", "kind": "function", "doc": "

    Process the data quality tasks for the data that was read and/or transformed.

    \n\n

    It supports multiple input dataframes, although just one is advisable.

    \n\n

    It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Another ordered dict with the validated dataframes.

    \n
    \n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.write", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.write", "kind": "function", "doc": "

    Write the data that was read and transformed (if applicable).

    \n\n

    It supports writing multiple datasets. However, we recommend writing only one dataframe. This recommendation is based on easier debugging and reproducibility, since mixing several datasets fed by the same algorithm leads to reproducibility issues, tight coupling, and dependencies between datasets. Having said that, there may be cases where writing multiple datasets is desirable according to the use case requirements. Use it accordingly.
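
    A sketch of a possible output_specs entry writing a single dataframe; field names mirror OutputSpec and all values are placeholders.

        output_specs = [
            {"spec_id": "sales_silver", "input_id": "sales_dq",
             "write_type": "append", "data_format": "delta",
             "partitions": ["order_date"],
             "db_table": "silver.sales"},
        ]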

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Dataframes that were written.

    \n
    \n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.terminate", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.terminate", "kind": "function", "doc": "

    Terminate the algorithm.

    \n\n
    Arguments:
    \n\n\n", "signature": "(self, data: collections.OrderedDict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.execute", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.execute", "kind": "function", "doc": "

    Define the algorithm execution behaviour.

    \n", "signature": "(self) -> Optional[collections.OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator", "modulename": "lakehouse_engine.algorithms.dq_validator", "kind": "module", "doc": "

    Module to define Data Validator class.

    \n"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator", "kind": "class", "doc": "

    Validate data using an algorithm configuration (ACON represented as dict).

    \n\n

    This algorithm focuses on isolating Data Quality Validations from loading, applying a set of data quality functions to a specific input dataset, without the need to define any output specification. You can use any input specification compatible with the lakehouse engine (dataframe, table, files, etc.).

    \n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.__init__", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.__init__", "kind": "function", "doc": "

    Construct DQValidator algorithm instances.

    \n\n

    A data quality validator needs the following specifications to work properly:
    - input specification (mandatory): specify how and what data to read.
    - data quality specification (mandatory): specify how to execute the data quality process.
    - restore_prev_version (optional): specify if, having delta table/files as input, they should be restored to the previous version if the data quality process fails. Note: this is only considered if fail_on_error is kept as True.
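
    A minimal ACON sketch for this algorithm, assuming the key names mirror DQValidatorSpec and using placeholder values:

        from lakehouse_engine.algorithms.dq_validator import DQValidator

        acon = {
            "input_spec": {"spec_id": "sales_silver", "read_type": "batch",
                           "data_format": "delta", "db_table": "silver.sales"},
            "dq_spec": {"spec_id": "sales_silver_dq", "input_id": "sales_silver",
                        "dq_type": "validator",
                        "dq_functions": [
                            {"function": "expect_column_values_to_not_be_null",
                             "args": {"column": "order_id"}}]},
            "restore_prev_version": True,
        }

        DQValidator(acon=acon).execute()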

    \n\n
    Arguments:
    \n\n\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.read", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.read", "kind": "function", "doc": "

    Read data from an input location into a distributed dataframe.

    \n\n
    Returns:
    \n\n
    \n

    Dataframe with data that was read.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.process_dq", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.process_dq", "kind": "function", "doc": "

    Process the data quality tasks for the data that was read.

    \n\n

    It supports a single input dataframe.

    \n\n

    It is possible to use data quality validators/expectations that validate your data and fail the process in case the expectations are not met. The DQ process also generates and keeps updating a site containing the results of the expectations that were executed on your data. The location of the site is configurable and can be either on the file system or on S3. If you define it to be stored on S3, you can even configure your S3 bucket to serve the site so that people can easily check the quality of your data. Moreover, it is also possible to store the result of the DQ process in a defined result sink.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Validated dataframe.

    \n
    \n", "signature": "(\tself,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.execute", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.execute", "kind": "function", "doc": "

    Define the algorithm execution behaviour.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.exceptions", "modulename": "lakehouse_engine.algorithms.exceptions", "kind": "module", "doc": "

    Package defining all the algorithm custom exceptions.

    \n"}, {"fullname": "lakehouse_engine.algorithms.exceptions.ReconciliationFailedException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "ReconciliationFailedException", "kind": "class", "doc": "

    Exception for when the reconciliation process fails.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.NoNewDataException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "NoNewDataException", "kind": "class", "doc": "

    Exception for when no new data is available.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.SensorAlreadyExistsException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "SensorAlreadyExistsException", "kind": "class", "doc": "

    Exception for when a sensor with the same sensor id already exists.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.RestoreTypeNotFoundException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "RestoreTypeNotFoundException", "kind": "class", "doc": "

    Exception for when the restore type is not found.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.gab", "modulename": "lakehouse_engine.algorithms.gab", "kind": "module", "doc": "

    Module to define Gold Asset Builder algorithm behavior.

    \n"}, {"fullname": "lakehouse_engine.algorithms.gab.GAB", "modulename": "lakehouse_engine.algorithms.gab", "qualname": "GAB", "kind": "class", "doc": "

    Class representing the gold asset builder.

    \n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.gab.GAB.__init__", "modulename": "lakehouse_engine.algorithms.gab", "qualname": "GAB.__init__", "kind": "function", "doc": "

    Construct GAB instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.gab.GAB.execute", "modulename": "lakehouse_engine.algorithms.gab", "qualname": "GAB.execute", "kind": "function", "doc": "

    Execute the Gold Asset Builder.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "kind": "module", "doc": "

    Module containing the Reconciliator class.

    \n"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType", "kind": "class", "doc": "

    Type of Reconciliation.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.PCT", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.PCT", "kind": "variable", "doc": "

    \n", "default_value": "<ReconciliationType.PCT: 'percentage'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.ABS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.ABS", "kind": "variable", "doc": "

    \n", "default_value": "<ReconciliationType.ABS: 'absolute'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers", "kind": "class", "doc": "

    Transformers Available for the Reconciliation Algorithm.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "annotation": ": dict", "default_value": "<ReconciliationTransformers.AVAILABLE_TRANSFORMERS: {'cache': <bound method Optimizers.cache of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>, 'persist': <bound method Optimizers.persist of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>}>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator", "kind": "class", "doc": "

    Class to define the behavior of an algorithm that checks if data reconciles.

    \n\n

    Checking if data reconciles, using this algorithm, is a matter of reading the 'truth' data and the 'current' data. You can use any input specification compatible with the lakehouse engine to read 'truth' or 'current' data. On top of that, you can pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can preprocess the data before it goes into the actual reconciliation process. Moreover, you can use 'truth_preprocess_query_args' and 'current_preprocess_query_args' to pass arguments for additional operations to be applied on top of the dataframe resulting from the previous steps, such as caching or persisting the dataframe. The way to pass these additional arguments is similar to the TransformSpec, but only a few operations are allowed; those are defined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.

    \n\n

    The reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).

    \n\n

    All of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.

    \n\n
    \n\n

    It is crucial that both the current and truth datasets have exactly the same\nstructure.

    \n\n
    \n\n
    \n\n

    You should not use 0 as a yellow or red threshold, as the algorithm verifies whether the difference between the truth and current values is greater than or equal to those thresholds.

    \n\n
    \n\n
    \n\n

    The reconciliation does not produce any negative values or percentages, as we use the absolute value of the differences. This means that the recon result will not indicate whether it was the current values that were bigger or smaller than the truth values.
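
    A minimal ACON sketch, assuming the top-level keys mirror ReconciliatorSpec; the per-metric keys (metric, aggregation, type, yellow, red) and all values are illustrative assumptions.

        from lakehouse_engine.algorithms.reconciliator import Reconciliator

        # Illustrative comparison of a current table against a source of
        # truth; thresholds must be greater than 0.
        acon = {
            "metrics": [{"metric": "net_sales", "aggregation": "sum",
                         "type": "percentage", "yellow": 0.05, "red": 0.1}],
            "truth_input_spec": {"spec_id": "truth", "read_type": "batch",
                                 "data_format": "csv",
                                 "location": "s3://my-bucket/truth/"},
            "current_input_spec": {"spec_id": "current", "read_type": "batch",
                                   "db_table": "silver.sales"},
        }

        Reconciliator(acon=acon).execute()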

    \n\n
    \n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.__init__", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.__init__", "kind": "function", "doc": "

    Construct Algorithm instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_source_of_truth", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_source_of_truth", "kind": "function", "doc": "

    Get the source of truth (expected result) for the reconciliation process.

    \n\n
    Returns:
    \n\n
    \n

    DataFrame containing the source of truth.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_current_results", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_current_results", "kind": "function", "doc": "

    Get the current results from the table that is being checked for reconciliation.

    \n\n
    Returns:
    \n\n
    \n

    DataFrame containing the current results.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.execute", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.execute", "kind": "function", "doc": "

    Reconcile the current results against the truth dataset.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.sensor", "modulename": "lakehouse_engine.algorithms.sensor", "kind": "module", "doc": "

    Module to define Sensor algorithm behavior.

    \n"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor", "kind": "class", "doc": "

    Class representing a sensor to check if the upstream has new data.
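
    A minimal ACON sketch, assuming the key names mirror SensorSpec and using placeholder values; execute() returns True when new upstream data was acquired.

        from lakehouse_engine.algorithms.sensor import Sensor

        acon = {
            "sensor_id": "sales_upstream_sensor",
            "assets": ["sales_bronze"],
            "control_db_table_name": "internal.sensor_control",
            "input_spec": {"spec_id": "upstream", "read_type": "streaming",
                           "data_format": "delta",
                           "db_table": "bronze.sales_upstream"},
            "preprocess_query": None,
            "checkpoint_location": "s3://my-bucket/checkpoints/sales_sensor/",
            "fail_on_empty_result": False,
        }

        new_data = Sensor(acon=acon).execute()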

    \n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.__init__", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.__init__", "kind": "function", "doc": "

    Construct Sensor instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.execute", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.execute", "kind": "function", "doc": "

    Execute the sensor.

    \n", "signature": "(self) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.configs", "modulename": "lakehouse_engine.configs", "kind": "module", "doc": "

    This module receives a config file which is included in the wheel.

    \n"}, {"fullname": "lakehouse_engine.core", "modulename": "lakehouse_engine.core", "kind": "module", "doc": "

    Package with the core behaviour of the lakehouse engine.

    \n"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager", "modulename": "lakehouse_engine.core.dbfs_file_manager", "kind": "module", "doc": "

    File manager module using dbfs.

    \n"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager", "kind": "class", "doc": "

    Set of actions to manipulate dbfs files in several ways.

    \n", "bases": "lakehouse_engine.core.file_manager.FileManager"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.get_function", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.get_function", "kind": "function", "doc": "

    Get a specific function to execute.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.delete_objects", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.delete_objects", "kind": "function", "doc": "

    Delete objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.copy_objects", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.copy_objects", "kind": "function", "doc": "

    Copy objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be copied based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.dbfs_file_manager.DBFSFileManager.move_objects", "modulename": "lakehouse_engine.core.dbfs_file_manager", "qualname": "DBFSFileManager.move_objects", "kind": "function", "doc": "

    Move objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be moved based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions", "modulename": "lakehouse_engine.core.definitions", "kind": "module", "doc": "

    Definitions of standard values and structures for core components.

    \n"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage", "kind": "class", "doc": "

    Options for collecting engine usage stats.

    \n\n\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage.ENABLED", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage.ENABLED", "kind": "variable", "doc": "

    \n", "default_value": "<CollectEngineUsage.ENABLED: 'enabled'>"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage.PROD_ONLY", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage.PROD_ONLY", "kind": "variable", "doc": "

    \n", "default_value": "<CollectEngineUsage.PROD_ONLY: 'prod_only'>"}, {"fullname": "lakehouse_engine.core.definitions.CollectEngineUsage.DISABLED", "modulename": "lakehouse_engine.core.definitions", "qualname": "CollectEngineUsage.DISABLED", "kind": "variable", "doc": "

    \n", "default_value": "<CollectEngineUsage.DISABLED: 'disabled'>"}, {"fullname": "lakehouse_engine.core.definitions.EngineConfig", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineConfig", "kind": "class", "doc": "

    Definitions that can come from the Engine Config file.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.EngineConfig.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdq_bucket: Optional[str] = None,\tnotif_disallowed_email_servers: Optional[list] = None,\tengine_usage_path: Optional[str] = None,\tengine_dev_usage_path: Optional[str] = None,\tcollect_engine_usage: str = 'enabled')"}, {"fullname": "lakehouse_engine.core.definitions.EngineStats", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineStats", "kind": "class", "doc": "

    Definitions for collection of Lakehouse Engine Stats.

    \n\n
    \n\n

    Note: whenever the value comes from a key inside a Spark Config\nthat returns an array, it can be specified with a '#' so that it\nis adequately processed.
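
    A sketch of how such a '#' key could be resolved, assuming the array-valued config (e.g., clusterAllTags) holds a JSON list of {"key", "value"} pairs; this illustrates the convention and is not the engine's implementation.

        import json

        def resolve_stat(spark_confs: dict, conf_key: str) -> str:
            # The part after '#' selects one entry from an array-valued
            # Spark config; without '#', the config value is returned as-is.
            if "#" not in conf_key:
                return spark_confs.get(conf_key, "")
            base_key, entry = conf_key.split("#", 1)
            tags = json.loads(spark_confs.get(base_key, "[]"))
            return next((t["value"] for t in tags if t["key"] == entry), "")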

    \n\n
    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.EngineStats.CLUSTER_USAGE_TAGS", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineStats.CLUSTER_USAGE_TAGS", "kind": "variable", "doc": "

    \n", "default_value": "<EngineStats.CLUSTER_USAGE_TAGS: 'spark.databricks.clusterUsageTags'>"}, {"fullname": "lakehouse_engine.core.definitions.EngineStats.DEF_SPARK_CONFS", "modulename": "lakehouse_engine.core.definitions", "qualname": "EngineStats.DEF_SPARK_CONFS", "kind": "variable", "doc": "

    \n", "default_value": "<EngineStats.DEF_SPARK_CONFS: {'dp_name': 'spark.databricks.clusterUsageTags.clusterAllTags#accountName', 'environment': 'spark.databricks.clusterUsageTags.clusterAllTags#environment', 'workspace_id': 'spark.databricks.clusterUsageTags.orgId', 'job_id': 'spark.databricks.clusterUsageTags.clusterAllTags#JobId', 'job_name': 'spark.databricks.clusterUsageTags.clusterAllTags#RunName', 'run_id': 'spark.databricks.clusterUsageTags.clusterAllTags#ClusterName'}>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat", "kind": "class", "doc": "

    Formats of algorithm input.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JDBC", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.AVRO", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JSON", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CSV", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.PARQUET", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DELTAFILES", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CLOUDFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CLOUDFILES", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.CLOUDFILES: 'cloudfiles'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.KAFKA", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SQL", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SQL", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.SQL: 'sql'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_BW", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_BW", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.SAP_BW: 'sap_bw'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_B4", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_B4", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.SAP_B4: 'sap_b4'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DATAFRAME", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SFTP", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SFTP", "kind": "variable", "doc": "

    \n", "default_value": "<InputFormat.SFTP: 'sftp'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.values", "kind": "function", "doc": "

    Generates a list containing all enum values.

    \n\n
    Return:
    \n\n
    \n

    A list with all enum values.

    \n
    \n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.exists", "kind": "function", "doc": "

    Checks if the input format exists in the enum values.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    If the input format exists in our enum.

    \n
    \n", "signature": "(cls, input_format: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat", "kind": "class", "doc": "

    Formats of algorithm output.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JDBC", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.AVRO", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JSON", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CSV", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.PARQUET", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DELTAFILES", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.KAFKA", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CONSOLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CONSOLE", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.CONSOLE: 'console'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.NOOP", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.NOOP", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.NOOP: 'noop'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DATAFRAME", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.FILE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.FILE", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.FILE: 'file'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.TABLE", "kind": "variable", "doc": "

    \n", "default_value": "<OutputFormat.TABLE: 'table'>"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType", "kind": "class", "doc": "

    Type of notifier available.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType.EMAIL", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType.EMAIL", "kind": "variable", "doc": "

    \n", "default_value": "<NotifierType.EMAIL: 'email'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters", "kind": "class", "doc": "

    Parameters to be replaced in runtime.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "kind": "variable", "doc": "

    \n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_JOB_NAME: 'databricks_job_name'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "kind": "variable", "doc": "

    \n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID: 'databricks_workspace_id'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType", "kind": "class", "doc": "

    Define the types of read operations.

    \n\n\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.BATCH", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.BATCH", "kind": "variable", "doc": "

    \n", "default_value": "<ReadType.BATCH: 'batch'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.STREAMING", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.STREAMING", "kind": "variable", "doc": "

    \n", "default_value": "<ReadType.STREAMING: 'streaming'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode", "kind": "class", "doc": "

    Different modes that control how we handle compliance with the provided schema.

    \n\n

    These read modes map to Spark's read modes at the moment.
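
    For illustration, a read mode would typically be passed through the input specification options and handed over to the Spark reader; all values are placeholders.

        input_spec = {"spec_id": "sales_raw", "read_type": "batch",
                      "data_format": "csv",
                      "location": "s3://my-bucket/sales/raw/",
                      "options": {"header": "true", "mode": "DROPMALFORMED"}}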

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.PERMISSIVE", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.PERMISSIVE", "kind": "variable", "doc": "

    \n", "default_value": "<ReadMode.PERMISSIVE: 'PERMISSIVE'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.FAILFAST", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.FAILFAST", "kind": "variable", "doc": "

    \n", "default_value": "<ReadMode.FAILFAST: 'FAILFAST'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.DROPMALFORMED", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.DROPMALFORMED", "kind": "variable", "doc": "

    \n", "default_value": "<ReadMode.DROPMALFORMED: 'DROPMALFORMED'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults", "kind": "class", "doc": "

    Defaults used in the data quality process.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_STORE", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.FILE_SYSTEM_STORE: 'file_system'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_S3_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_S3_STORE", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_BATCH_IDENTIFIERS", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_BATCH_IDENTIFIERS", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DQ_BATCH_IDENTIFIERS: ['spec_id', 'input_id', 'timestamp']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_CLASS_NAME", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATASOURCE_CLASS_NAME: 'Datasource'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_EXECUTION_ENGINE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_EXECUTION_ENGINE", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATASOURCE_EXECUTION_ENGINE: 'SparkDFExecutionEngine'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_CLASS_NAME", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATA_CONNECTORS_CLASS_NAME: 'RuntimeDataConnector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_MODULE_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_MODULE_NAME", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATA_CONNECTORS_MODULE_NAME: 'great_expectations.datasource.data_connector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CLASS_NAME: 'SimpleCheckpoint'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION: 1.0>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.STORE_BACKEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.STORE_BACKEND", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.EXPECTATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.EXPECTATIONS_STORE_PREFIX", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.EXPECTATIONS_STORE_PREFIX: 'dq/expectations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATIONS_STORE_PREFIX", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.VALIDATIONS_STORE_PREFIX: 'dq/validations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_DOCS_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_DOCS_PREFIX", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DATA_DOCS_PREFIX: 'dq/data_docs/site/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CHECKPOINT_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CHECKPOINT_STORE_PREFIX", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.CHECKPOINT_STORE_PREFIX: 'dq/checkpoints/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.VALIDATION_COLUMN_IDENTIFIER: 'validationresultidentifier'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CUSTOM_EXPECTATION_LIST", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CUSTOM_EXPECTATION_LIST", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.CUSTOM_EXPECTATION_LIST: ['expect_column_values_to_be_date_not_older_than', 'expect_column_pair_a_to_be_smaller_or_equal_than_b', 'expect_multicolumn_column_a_must_equal_b_or_c', 'expect_queried_column_agg_value_to_be']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_VALIDATIONS_SCHEMA", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_VALIDATIONS_SCHEMA", "kind": "variable", "doc": "

    \n", "default_value": "<DQDefaults.DQ_VALIDATIONS_SCHEMA: StructType([StructField('dq_validations', StructType([StructField('run_name', StringType(), True), StructField('run_success', BooleanType(), True), StructField('raised_exceptions', BooleanType(), True), StructField('run_row_success', BooleanType(), True), StructField('dq_failure_details', ArrayType(StructType([StructField('expectation_type', StringType(), True), StructField('kwargs', StringType(), True)]), True), True)]), True)])>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType", "kind": "class", "doc": "

    Types of write operations.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.OVERWRITE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.OVERWRITE", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.OVERWRITE: 'overwrite'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.COMPLETE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.COMPLETE", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.COMPLETE: 'complete'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.APPEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.APPEND", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.APPEND: 'append'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.UPDATE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.UPDATE", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.UPDATE: 'update'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.MERGE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.MERGE", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.MERGE: 'merge'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.ERROR_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.ERROR_IF_EXISTS", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.ERROR_IF_EXISTS: 'error'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.IGNORE_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.IGNORE_IF_EXISTS", "kind": "variable", "doc": "

    \n", "default_value": "<WriteType.IGNORE_IF_EXISTS: 'ignore'>"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec", "kind": "class", "doc": "

    Specification of an algorithm input.

    \n\n

    This is closely aligned with the way the execution environment connects to the sources (e.g., Spark sources).

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tspec_id: str,\tread_type: str,\tdata_format: Optional[str] = None,\tsftp_files_format: Optional[str] = None,\tdf_name: Optional[pyspark.sql.dataframe.DataFrame] = None,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tquery: Optional[str] = None,\tenforce_schema_from_table: Optional[str] = None,\tschema: Optional[dict] = None,\tschema_path: Optional[str] = None,\tdisable_dbfs_retry: bool = False,\twith_filepath: bool = False,\toptions: Optional[dict] = None,\tjdbc_args: Optional[dict] = None,\tcalculate_upper_bound: bool = False,\tcalc_upper_bound_schema: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates_add_null: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec", "kind": "class", "doc": "

    Transformer Specification, i.e., a single transformation amongst many.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(function: str, args: dict)"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec", "kind": "class", "doc": "

    Transformation Specification.

    \n\n

    I.e., the specification that defines the many transformations to be done to the data\nthat was read.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tspec_id: str,\tinput_id: str,\ttransformers: List[lakehouse_engine.core.definitions.TransformerSpec],\tforce_streaming_foreach_batch_processing: bool = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQType", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType", "kind": "class", "doc": "

    Available data quality tasks.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQType.VALIDATOR", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.VALIDATOR", "kind": "variable", "doc": "

    \n", "default_value": "<DQType.VALIDATOR: 'validator'>"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec", "kind": "class", "doc": "

    Defines a data quality function specification.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(function: str, args: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec", "kind": "class", "doc": "

    Data quality overall specification.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tspec_id: str,\tinput_id: str,\tdq_type: str,\tdq_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tunexpected_rows_pk: Optional[List[str]] = None,\ttbl_to_derive_pk: Optional[str] = None,\tgx_result_format: Optional[str] = 'COMPLETE',\ttag_source_data: Optional[bool] = False,\tstore_backend: str = 's3',\tlocal_fs_root_dir: Optional[str] = None,\tdata_docs_local_fs: Optional[str] = None,\tbucket: Optional[str] = None,\tdata_docs_bucket: Optional[str] = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tdata_docs_prefix: str = 'dq/data_docs/site/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/',\tdata_asset_name: Optional[str] = None,\texpectation_suite_name: Optional[str] = None,\tresult_sink_db_table: Optional[str] = None,\tresult_sink_location: Optional[str] = None,\tresult_sink_partitions: Optional[List[str]] = None,\tresult_sink_format: str = 'delta',\tresult_sink_options: Optional[dict] = None,\tresult_sink_explode: bool = True,\tresult_sink_extra_columns: Optional[List[str]] = None,\tsource: Optional[str] = None,\tfail_on_error: bool = True,\tcache_df: bool = False,\tcritical_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tmax_percentage_failure: Optional[float] = None)"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions", "kind": "class", "doc": "

    Options for a merge operation.
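
    A sketch of an output specification using merge options; the field names follow MergeOptions, while the predicate aliases and values are illustrative assumptions.

        output_spec = {
            "spec_id": "sales_gold", "input_id": "sales_silver",
            "write_type": "merge", "data_format": "delta",
            "db_table": "gold.sales",
            "merge_opts": {
                "merge_predicate": "current.order_id = new.order_id",
                "insert_only": False,
                "update_predicate": "new.updated_at > current.updated_at",
            },
        }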

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tmerge_predicate: str,\tinsert_only: bool = False,\tdelete_predicate: Optional[str] = None,\tupdate_predicate: Optional[str] = None,\tinsert_predicate: Optional[str] = None,\tupdate_column_set: Optional[dict] = None,\tinsert_column_set: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec", "kind": "class", "doc": "

    Specification of an algorithm output.

    \n\n

    This is closely aligned with the way the execution environment connects to the output systems (e.g., Spark outputs).

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tspec_id: str,\tinput_id: str,\twrite_type: str,\tdata_format: str = 'delta',\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tmerge_opts: Optional[lakehouse_engine.core.definitions.MergeOptions] = None,\tpartitions: Optional[List[str]] = None,\tstreaming_micro_batch_transformers: Optional[List[lakehouse_engine.core.definitions.TransformerSpec]] = None,\tstreaming_once: Optional[bool] = None,\tstreaming_processing_time: Optional[str] = None,\tstreaming_available_now: bool = True,\tstreaming_continuous: Optional[str] = None,\tstreaming_await_termination: bool = True,\tstreaming_await_termination_timeout: Optional[int] = None,\twith_batch_id: bool = False,\toptions: Optional[dict] = None,\tstreaming_micro_batch_dq_processors: Optional[List[lakehouse_engine.core.definitions.DQSpec]] = None)"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec", "kind": "class", "doc": "

    Terminator Specification.

    \n\n

    I.e., the specification that defines a terminator operation to be executed. Examples\nare compute statistics, vacuum, optimize, etc.
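
    A sketch of a possible terminate_specs entry; the function name below is a placeholder for whatever terminator operation is configured, and the argument key is an assumption.

        terminate_specs = [
            {"function": "optimize_dataset",
             "args": {"db_table": "silver.sales"}},
        ]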

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tfunction: str,\targs: Optional[dict] = None,\tinput_id: Optional[str] = None)"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec", "kind": "class", "doc": "

    Reconciliator Specification.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tmetrics: List[dict],\ttruth_input_spec: lakehouse_engine.core.definitions.InputSpec,\tcurrent_input_spec: lakehouse_engine.core.definitions.InputSpec,\ttruth_preprocess_query: Optional[str] = None,\ttruth_preprocess_query_args: Optional[List[dict]] = None,\tcurrent_preprocess_query: Optional[str] = None,\tcurrent_preprocess_query_args: Optional[List[dict]] = None,\tignore_empty_df: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec", "kind": "class", "doc": "

    Data Quality Validator Specification.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\trestore_prev_version: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions", "kind": "class", "doc": "

    SQL statement definitions.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.compute_table_stats", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.compute_table_stats", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.compute_table_stats: 'ANALYZE TABLE {} COMPUTE STATISTICS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_table_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_table_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.drop_table_stmt: 'DROP TABLE IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_view_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_view_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.drop_view_stmt: 'DROP VIEW IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.truncate_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.truncate_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.truncate_stmt: 'TRUNCATE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.describe_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.describe_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.describe_stmt: 'DESCRIBE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.optimize_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.optimize_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.optimize_stmt: 'OPTIMIZE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.show_tbl_props_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.show_tbl_props_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.show_tbl_props_stmt: 'SHOW TBLPROPERTIES'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.delete_where_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.delete_where_stmt", "kind": "variable", "doc": "

    \n", "default_value": "<SQLDefinitions.delete_where_stmt: 'DELETE FROM {} WHERE {}'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys", "kind": "class", "doc": "

    File Manager S3 API keys.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTENTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTENTS", "kind": "variable", "doc": "

    \n", "default_value": "<FileManagerAPIKeys.CONTENTS: 'Contents'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.KEY", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.KEY", "kind": "variable", "doc": "

    \n", "default_value": "<FileManagerAPIKeys.KEY: 'Key'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTINUATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTINUATION", "kind": "variable", "doc": "

    \n", "default_value": "<FileManagerAPIKeys.CONTINUATION: 'NextContinuationToken'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.BUCKET", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.BUCKET", "kind": "variable", "doc": "

    \n", "default_value": "<FileManagerAPIKeys.BUCKET: 'Bucket'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.OBJECTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.OBJECTS", "kind": "variable", "doc": "

    \n", "default_value": "<FileManagerAPIKeys.OBJECTS: 'Objects'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec", "kind": "class", "doc": "

    Sensor Specification.

    \n\n\n"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tsensor_id: str,\tassets: List[str],\tcontrol_db_table_name: str,\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tpreprocess_query: Optional[str],\tcheckpoint_location: Optional[str],\tfail_on_empty_result: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.create_from_acon", "kind": "function", "doc": "

    Create SensorSpec from acon.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus", "kind": "class", "doc": "

    Status for a sensor.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.ACQUIRED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.ACQUIRED_NEW_DATA", "kind": "variable", "doc": "

    \n", "default_value": "<SensorStatus.ACQUIRED_NEW_DATA: 'ACQUIRED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.PROCESSED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.PROCESSED_NEW_DATA", "kind": "variable", "doc": "

    \n", "default_value": "<SensorStatus.PROCESSED_NEW_DATA: 'PROCESSED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain", "kind": "class", "doc": "

    Defaults used when consuming data from SAP Logchain.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.DBTABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.DBTABLE", "kind": "variable", "doc": "

    \n", "default_value": "<SAPLogchain.DBTABLE: 'SAPPHA.RSPCLOGCHAIN'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.GREEN_STATUS", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.GREEN_STATUS", "kind": "variable", "doc": "

    \n", "default_value": "<SAPLogchain.GREEN_STATUS: 'G'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.ENGINE_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.ENGINE_TABLE", "kind": "variable", "doc": "

    \n", "default_value": "<SAPLogchain.ENGINE_TABLE: 'sensor_new_data'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType", "kind": "class", "doc": "

    Archive types.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.BULK", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.BULK", "kind": "variable", "doc": "

    \n", "default_value": "<RestoreType.BULK: 'Bulk'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.STANDARD", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.STANDARD", "kind": "variable", "doc": "

    \n", "default_value": "<RestoreType.STANDARD: 'Standard'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.EXPEDITED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.EXPEDITED", "kind": "variable", "doc": "

    \n", "default_value": "<RestoreType.EXPEDITED: 'Expedited'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.values", "kind": "function", "doc": "

    Generates a list containing all enum values.

    \n\n
    Return:
    \n\n
    \n

    A list with all enum values.

    \n
    \n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.exists", "kind": "function", "doc": "

    Checks if the restore type exists in the enum values.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    If the restore type exists in our enum.

    \n
    \n", "signature": "(cls, restore_type: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus", "kind": "class", "doc": "

    Archive restore statuses.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.NOT_STARTED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.NOT_STARTED", "kind": "variable", "doc": "

    \n", "default_value": "<RestoreStatus.NOT_STARTED: 'not_started'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.ONGOING", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.ONGOING", "kind": "variable", "doc": "

    \n", "default_value": "<RestoreStatus.ONGOING: 'ongoing'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.RESTORED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.RESTORED", "kind": "variable", "doc": "

    \n", "default_value": "<RestoreStatus.RESTORED: 'restored'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser", "kind": "class", "doc": "

    Defaults to use for parsing.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.DOUBLE_QUOTES", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.DOUBLE_QUOTES", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.DOUBLE_QUOTES: '"'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.SINGLE_QUOTES", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.SINGLE_QUOTES", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.SINGLE_QUOTES: "'">"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.BACKSLASH", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.BACKSLASH", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.BACKSLASH: '\\\\'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.SINGLE_TRACE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.SINGLE_TRACE", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.SINGLE_TRACE: '-'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.DOUBLE_TRACES", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.DOUBLE_TRACES", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.DOUBLE_TRACES: '--'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.SLASH", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.SLASH", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.SLASH: '/'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.OPENING_MULTIPLE_LINE_COMMENT", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.OPENING_MULTIPLE_LINE_COMMENT", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.OPENING_MULTIPLE_LINE_COMMENT: '/*'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.CLOSING_MULTIPLE_LINE_COMMENT", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.CLOSING_MULTIPLE_LINE_COMMENT", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.CLOSING_MULTIPLE_LINE_COMMENT: '*/'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.PARAGRAPH", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.PARAGRAPH", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.PARAGRAPH: '\\n'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.STAR", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.STAR", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.STAR: '*'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLParser.MULTIPLE_LINE_COMMENT", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLParser.MULTIPLE_LINE_COMMENT", "kind": "variable", "doc": "

    \n", "default_value": "<SQLParser.MULTIPLE_LINE_COMMENT: ['/*', '*/']>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults", "kind": "class", "doc": "

    Defaults used on the GAB process.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DATE_FORMAT", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DATE_FORMAT", "kind": "variable", "doc": "

    \n", "default_value": "<GABDefaults.DATE_FORMAT: '%Y-%m-%d'>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DIMENSIONS_DEFAULT_COLUMNS", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DIMENSIONS_DEFAULT_COLUMNS", "kind": "variable", "doc": "

    \n", "default_value": "<GABDefaults.DIMENSIONS_DEFAULT_COLUMNS: ['from_date', 'to_date']>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE", "kind": "variable", "doc": "

    \n", "default_value": "<GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE: 'dim_calendar'>"}, {"fullname": "lakehouse_engine.core.definitions.GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE", "kind": "variable", "doc": "

    \n", "default_value": "<GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE: 'lkp_query_builder'>"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek", "kind": "class", "doc": "

    Representation of start of week values on GAB.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.SUNDAY", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.SUNDAY", "kind": "variable", "doc": "

    \n", "default_value": "<GABStartOfWeek.SUNDAY: 'S'>"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.MONDAY", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.MONDAY", "kind": "variable", "doc": "

    \n", "default_value": "<GABStartOfWeek.MONDAY: 'M'>"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.get_start_of_week", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.get_start_of_week", "kind": "function", "doc": "

    Get the start of week enum as a dict.

    \n\n
    Returns:
    \n\n
    \n

    dict containing all enum entries as {name:value}.

    \n
    \n", "signature": "(cls) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABStartOfWeek.get_values", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABStartOfWeek.get_values", "kind": "function", "doc": "

    Get the start of week enum values as set.

    \n\n
    Returns:
    \n\n
    \n

    set containing all possible values {value}.

    \n
    \n", "signature": "(cls) -> set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABSpec", "kind": "class", "doc": "

    GAB specification.

    \n\n

    query_label_filter: query use-case label to execute.\nqueue_filter: queue to execute the job.\ncadence_filter: selected cadences to build the asset.\ntarget_database: target database to write.\ncurr_date: current date.\nstart_date: period start date.\nend_date: period end date.\nrerun_flag: rerun flag.\ntarget_table: target table to write.\nsource_database: source database.\ngab_base_path: base path to read the use cases.\nlookup_table: gab configuration table.\ncalendar_table: gab calendar table.

    \n"}, {"fullname": "lakehouse_engine.core.definitions.GABSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABSpec.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_label_filter: list[str],\tqueue_filter: list[str],\tcadence_filter: list[str],\ttarget_database: str,\tcurrent_date: datetime.datetime,\tstart_date: datetime.datetime,\tend_date: datetime.datetime,\trerun_flag: str,\ttarget_table: str,\tsource_database: str,\tgab_base_path: str,\tlookup_table: str,\tcalendar_table: str)"}, {"fullname": "lakehouse_engine.core.definitions.GABSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABSpec.create_from_acon", "kind": "function", "doc": "

    Create GABSpec from acon.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence", "kind": "class", "doc": "

    Representation of the supported cadences on GAB.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.DAY", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.DAY", "kind": "variable", "doc": "

    \n", "default_value": "<GABCadence.DAY: 1>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.WEEK", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.WEEK", "kind": "variable", "doc": "

    \n", "default_value": "<GABCadence.WEEK: 2>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.MONTH", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.MONTH", "kind": "variable", "doc": "

    \n", "default_value": "<GABCadence.MONTH: 3>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.QUARTER", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.QUARTER", "kind": "variable", "doc": "

    \n", "default_value": "<GABCadence.QUARTER: 4>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.YEAR", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.YEAR", "kind": "variable", "doc": "

    \n", "default_value": "<GABCadence.YEAR: 5>"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.get_ordered_cadences", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.get_ordered_cadences", "kind": "function", "doc": "

    Get the cadences ordered by the value.

    \n\n
    Returns:
    \n\n
    \n

    dict containing ordered cadences as {name:value}.

    \n
    \n", "signature": "(cls) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.get_cadences", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.get_cadences", "kind": "function", "doc": "

    Get the cadences values as set.

    \n\n
    Returns:
    \n\n
    \n

    set containing all possible cadence values as {value}.

    \n
    \n", "signature": "(cls) -> set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABCadence.order_cadences", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCadence.order_cadences", "kind": "function", "doc": "

    Order a list of cadences by value.

    \n\n
    Returns:
    \n\n
    \n

    ordered set containing the received cadences.

    \n
    \n", "signature": "(cls, cadences_to_order: list[str]) -> list[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.GABKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABKeys", "kind": "class", "doc": "

    Constants used to update pre-configured gab dict key.

    \n"}, {"fullname": "lakehouse_engine.core.definitions.GABReplaceableKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABReplaceableKeys", "kind": "class", "doc": "

    Constants used to replace pre-configured gab dict values.

    \n"}, {"fullname": "lakehouse_engine.core.definitions.GABCombinedConfiguration", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCombinedConfiguration", "kind": "class", "doc": "

    GAB combined configuration.

    \n\n

    Based on the use case configuration, return the values to override in the SQL file.\nThis enum aims to exhaustively map each combination of cadence, reconciliation,\n week_start and snap_flag and return the corresponding join_select,\n project_start and project_end values to replace in the stages SQL file.

    \n\n

    Return corresponding configuration (join_select, project_start, project_end) for\n each combination (cadence x recon x week_start x snap_flag).

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABCombinedConfiguration.COMBINED_CONFIGURATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCombinedConfiguration.COMBINED_CONFIGURATION", "kind": "variable", "doc": "

    \n", "default_value": "<GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\\n select distinct case\\n when '${config_week_start}' = 'Monday' then weekstart_mon\\n when '${config_week_start}' = 'Sunday' then weekstart_sun\\n end as cadence_start_date,\\n calendar_date as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'YEAR', 'DAY', 'MONTH', 'QUARTER'}, 'week_start': 'M', 'snap_flag': {'Y', 'N'}, 'join_select': "\\n select distinct case\\n when '${config_week_start}' = 'Monday' then weekstart_mon\\n when '${config_week_start}' = 'Sunday' then weekstart_sun\\n end as cadence_start_date,\\n case\\n when '${config_week_start}' = 'Monday' then weekend_mon\\n when '${config_week_start}' = 'Sunday' then weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct month_start as cadence_start_date,\\n calendar_date as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct month_start as cadence_start_date,\\n case\\n when date(\\n date_trunc('MONTH',add_months(calendar_date, 1))\\n )-1 < weekend_mon\\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct month_start as cadence_start_date,\\n case\\n when date(\\n date_trunc('MONTH',add_months(calendar_date, 1))\\n )-1 < weekend_sun\\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct quarter_start as cadence_start_date,\\n calendar_date as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct quarter_start as cadence_start_date,\\n case\\n when weekend_mon > date(\\n date_trunc('QUARTER',add_months(calendar_date, 3))\\n )-1\\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 
'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct quarter_start as cadence_start_date,\\n case\\n when weekend_sun > date(\\n date_trunc('QUARTER',add_months(calendar_date, 3))\\n )-1\\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct quarter_start as cadence_start_date,\\n month_end as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\\n )\\n )-1\\n "}, 13: {'cadence': 'QUARTER', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\\n )\\n )-1\\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when weekend_mon > date(\\n date_trunc('YEAR',add_months(calendar_date, 12))\\n )-1\\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when weekend_sun > date(\\n date_trunc('YEAR',add_months(calendar_date, 12))\\n )-1\\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\\n )\\n )-1\\n "}, 17: {'cadence': 'YEAR', 'recon': {'DAY', 'MONTH', 'QUARTER'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when '${rec_cadence}' = 'DAY' then calendar_date\\n when '${rec_cadence}' = 'MONTH' then month_end\\n when '${rec_cadence}' = 'QUARTER' then quarter_end\\n end as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'recon': {'MONTH', 'QUARTER', 'WEEK', 'DAY', 'YEAR'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': "\\n select distinct\\n case\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\\n then weekstart_mon\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\\n then weekstart_sun\\n else\\n date(date_trunc('${cad}',calendar_date))\\n end as cadence_start_date,\\n case\\n when '${cad}' = 'WEEK' and 
'${config_week_start}' = 'Monday'\\n then weekend_mon\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\\n then weekend_sun\\n when '${cad}' = 'DAY'\\n then date(date_trunc('${cad}',calendar_date))\\n when '${cad}' = 'MONTH'\\n then date(\\n date_trunc(\\n 'MONTH',\\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\\n )\\n )-1\\n when '${cad}' = 'QUARTER'\\n then date(\\n date_trunc(\\n 'QUARTER',\\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\\n )\\n )-1\\n when '${cad}' = 'YEAR'\\n then date(\\n date_trunc(\\n 'YEAR',\\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\\n )\\n )-1\\n end as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "

    Module to take care of creating a singleton of the execution environment class.

    \n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv", "kind": "class", "doc": "

    Represents the basic resources regarding the engine execution environment.

    \n\n

    Currently, it is used to encapsulate both the logic to get the Spark\nsession and the engine configurations.

    \n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.set_default_engine_config", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.set_default_engine_config", "kind": "function", "doc": "

    Set default engine configurations by reading them from a specified package.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, package: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.get_or_create", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.get_or_create", "kind": "function", "doc": "

    Get or create an execution environment session (currently Spark).

    \n\n

    It instantiates a singleton session that can be accessed anywhere from the\nlakehouse engine.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tsession: pyspark.sql.session.SparkSession = None,\tenable_hive_support: bool = True,\tapp_name: str = None,\tconfig: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.executable", "modulename": "lakehouse_engine.core.executable", "kind": "module", "doc": "

    Module representing an executable lakehouse engine component.

    \n"}, {"fullname": "lakehouse_engine.core.executable.Executable", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable", "kind": "class", "doc": "

    Abstract class defining the behaviour of an executable component.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.executable.Executable.execute", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable.execute", "kind": "function", "doc": "

    Define the executable component behaviour.

    \n\n

    E.g., the behaviour of an algorithm inheriting from this.

    \n", "signature": "(self) -> Optional[Any]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager", "modulename": "lakehouse_engine.core.file_manager", "kind": "module", "doc": "

    Module for abstract representation of a file manager system.

    \n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager", "kind": "class", "doc": "

    Abstract file manager class.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.__init__", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.__init__", "kind": "function", "doc": "

    Construct FileManager algorithm instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.delete_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.delete_objects", "kind": "function", "doc": "

    Delete objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.copy_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.copy_objects", "kind": "function", "doc": "

    Copies objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be copied based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.move_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.move_objects", "kind": "function", "doc": "

    Moves objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be moved based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManagerFactory", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManagerFactory", "kind": "class", "doc": "

    Class for file manager factory.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.file_manager.FileManagerFactory.execute_function", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManagerFactory.execute_function", "kind": "function", "doc": "

    Get a specific File Manager and function to execute.

    \n", "signature": "(configs: dict) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_manager", "modulename": "lakehouse_engine.core.gab_manager", "kind": "module", "doc": "

    Module to define GAB Manager classes.

    \n"}, {"fullname": "lakehouse_engine.core.gab_manager.GABCadenceManager", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABCadenceManager", "kind": "class", "doc": "

    Class to control the GAB Cadence Window.

    \n"}, {"fullname": "lakehouse_engine.core.gab_manager.GABCadenceManager.extended_window_calculator", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABCadenceManager.extended_window_calculator", "kind": "function", "doc": "

    Calculate the extended cadence window.

    \n\n

    Calculates the extended window of any cadence, even when the user provides\ncustom dates which are not the exact start and end dates of a cadence.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tself,\tcadence: str,\treconciliation_cadence: str,\tcurrent_date: datetime.datetime,\tstart_date_str: str,\tend_date_str: str,\tquery_type: str,\trerun_flag: str,\tsnapshot_flag: str) -> tuple[datetime.datetime, datetime.datetime, datetime.datetime, datetime.datetime]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_manager.GABCadenceManager.get_cadence_start_end_dates", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABCadenceManager.get_cadence_start_end_dates", "kind": "function", "doc": "

    Generate the new set of extended start and end dates based on the cadence.

    \n\n

    The week cadence is run again to extend to the correct week start and end dates when\n a recon window for the Week cadence is present.\nFor example, for end_date 2022-12-31, in case a Quarter recon window is present for the Week\n cadence, the start and end dates are recalculated to 2022-10-01 and 2022-12-31.\nBut these are not the start and end dates of a week. Hence, to correct this, the new dates\n are passed again to get the correct dates.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tself,\tcadence: str,\tderived_cadence: str,\tstart_date: datetime.datetime,\tend_date: datetime.datetime,\tquery_type: str,\tcurrent_date: datetime.datetime) -> tuple[datetime.datetime, datetime.datetime]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_manager.GABViewManager", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABViewManager", "kind": "class", "doc": "

    Class to control the GAB View creation.

    \n"}, {"fullname": "lakehouse_engine.core.gab_manager.GABViewManager.__init__", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABViewManager.__init__", "kind": "function", "doc": "

    Construct GABViewManager instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tquery_id: str,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\ttarget_database: str,\ttarget_table: str)"}, {"fullname": "lakehouse_engine.core.gab_manager.GABViewManager.generate_use_case_views", "modulename": "lakehouse_engine.core.gab_manager", "qualname": "GABViewManager.generate_use_case_views", "kind": "function", "doc": "

    Generate all the use case views.

    \n\n

    Generates the DDL for each of the views. Each DDL is dynamically built based on\nthe mappings provided in the config table.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator", "modulename": "lakehouse_engine.core.gab_sql_generator", "kind": "module", "doc": "

    Module to define GAB SQL classes.

    \n"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABSQLGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABSQLGenerator", "kind": "class", "doc": "

    Abstract class defining the behaviour of a GAB SQL Generator.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABSQLGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABSQLGenerator.generate_sql", "kind": "function", "doc": "

    Define the generate sql command.

    \n\n

    E.g., the behaviour of gab generate sql inheriting from this.

    \n", "signature": "(self) -> Optional[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABInsertGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABInsertGenerator", "kind": "class", "doc": "

    GAB insert generator.

    \n\n

    Creates the insert statement based on the dimensions and metrics provided in\nthe configuration table.

    \n", "bases": "GABSQLGenerator"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABInsertGenerator.__init__", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABInsertGenerator.__init__", "kind": "function", "doc": "

    Construct GABInsertGenerator instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tquery_id: str,\tcadence: str,\tfinal_stage_table: str,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\ttarget_database: str,\ttarget_table: str)"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABInsertGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABInsertGenerator.generate_sql", "kind": "function", "doc": "

    Generate insert sql statement to the insights table.

    \n", "signature": "(self) -> Optional[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABViewGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABViewGenerator", "kind": "class", "doc": "

    GAB view generator.

    \n\n

    Creates the use case view statement to be consumed.

    \n", "bases": "GABSQLGenerator"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABViewGenerator.__init__", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABViewGenerator.__init__", "kind": "function", "doc": "

    Construct GABViewGenerator instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcadence_snapshot_status: dict,\ttarget_database: str,\tview_name: str,\tfinal_cols: str,\ttarget_table: str,\tdimensions_and_metrics_with_alias: str,\tdimensions: str,\tdimensions_and_metrics: str,\tfinal_calculated_script: str,\tquery_id: str,\tview_filter: str,\tfinal_calculated_script_snapshot: str,\twithout_snapshot_cadences: list[str],\twith_snapshot_cadences: list[str])"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABViewGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABViewGenerator.generate_sql", "kind": "function", "doc": "

    Generate use case view sql statement.

    \n", "signature": "(*args: Any) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABDeleteGenerator", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABDeleteGenerator", "kind": "class", "doc": "

    GAB delete generator.

    \n\n

    Creates the delete statement to clean the use case base data on the insights table.

    \n", "bases": "GABSQLGenerator"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABDeleteGenerator.__init__", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABDeleteGenerator.__init__", "kind": "function", "doc": "

    Construct GABDeleteGenerator instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tquery_id: str,\tcadence: str,\ttemp_stage_view_name: str,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\ttarget_database: str,\ttarget_table: str)"}, {"fullname": "lakehouse_engine.core.gab_sql_generator.GABDeleteGenerator.generate_sql", "modulename": "lakehouse_engine.core.gab_sql_generator", "qualname": "GABDeleteGenerator.generate_sql", "kind": "function", "doc": "

    Generate delete sql statement.

    \n\n

    This statement is to clean the insights table for the corresponding use case.

    \n", "signature": "(*args: Any) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager", "modulename": "lakehouse_engine.core.s3_file_manager", "kind": "module", "doc": "

    File manager module using boto3.

    \n"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager", "kind": "class", "doc": "

    Set of actions to manipulate s3 files in several ways.

    \n", "bases": "lakehouse_engine.core.file_manager.FileManager"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.get_function", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.get_function", "kind": "function", "doc": "

    Get a specific function to execute.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.delete_objects", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.delete_objects", "kind": "function", "doc": "

    Delete objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.copy_objects", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.copy_objects", "kind": "function", "doc": "

    Copies objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be copied based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.move_objects", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.move_objects", "kind": "function", "doc": "

    Moves objects and 'directories'.

    \n\n

    If dry_run is set to True the function will print a dict with all the\npaths that would be moved based on the given keys.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.request_restore", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.request_restore", "kind": "function", "doc": "

    Request the restore of archived data.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.check_restore_status", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.check_restore_status", "kind": "function", "doc": "

    Check the restore status of archived data.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.S3FileManager.request_restore_to_destination_and_wait", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "S3FileManager.request_restore_to_destination_and_wait", "kind": "function", "doc": "

    Request and wait for the restore to complete, polling the restore status.

    \n\n

    After the restore is done, copy the restored files to the destination.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager", "kind": "class", "doc": "

    Set of actions to restore archives.

    \n"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager.check_restore_status", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager.check_restore_status", "kind": "function", "doc": "

    Check the restore status of archived data.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A dict containing the amount of objects in each status.

    \n
    \n", "signature": "(source_bucket: str, source_object: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager.request_restore", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager.request_restore", "kind": "function", "doc": "

    Request the restore of archived data.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.s3_file_manager.ArchiveFileManager.request_restore_and_wait", "modulename": "lakehouse_engine.core.s3_file_manager", "qualname": "ArchiveFileManager.request_restore_and_wait", "kind": "function", "doc": "

    Request and wait for the restore to complete, polling the restore status.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager", "modulename": "lakehouse_engine.core.sensor_manager", "kind": "module", "doc": "

    Module to define Sensor Manager classes.

    \n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager", "kind": "class", "doc": "

    Class to control the Sensor execution.

    \n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.check_if_sensor_has_acquired_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.check_if_sensor_has_acquired_data", "kind": "function", "doc": "

    Check if sensor has acquired new data.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    True if the sensor has acquired new data, otherwise False.

    \n
    \n", "signature": "(cls, sensor_id: str, control_db_table_name: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.update_sensor_status", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.update_sensor_status", "kind": "function", "doc": "

    Control sensor execution storing the execution data in a delta table.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec,\tstatus: str,\tupstream_key: str = None,\tupstream_value: str = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.read_sensor_table_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.read_sensor_table_data", "kind": "function", "doc": "

    Read data from delta table containing sensor status info.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    Row containing the data for the provided sensor_id.

    \n
    \n", "signature": "(\tcls,\tcontrol_db_table_name: str,\tsensor_id: str = None,\tassets: list = None) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager", "kind": "class", "doc": "

    Class to deal with Sensor Upstream data.

    \n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_filter_exp_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_filter_exp_query", "kind": "function", "doc": "

    Generates a sensor preprocess query based on timestamp logic.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    The query string.

    \n
    \n", "signature": "(\tcls,\tsensor_id: str,\tfilter_exp: str,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_table_preprocess_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_table_preprocess_query", "kind": "function", "doc": "

    Generates a query to be used for a sensor that has another sensor as upstream.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    The query string.

    \n
    \n", "signature": "(cls, sensor_id: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.read_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.read_new_data", "kind": "function", "doc": "

    Read new data from the upstream into the sensor 'new_data_df'.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    An empty dataframe if there is no new data, otherwise the new data.

    \n
    \n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.get_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.get_new_data", "kind": "function", "doc": "

    Get new data from upstream df if it's present.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    Optional row, present if there is new data in the upstream,\n absent otherwise.

    \n
    \n", "signature": "(\tcls,\tnew_data_df: pyspark.sql.dataframe.DataFrame) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_sap_logchain_query", "kind": "function", "doc": "

    Generates a sensor query based on the SAP Logchain table.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    The query string.

    \n
    \n", "signature": "(\tcls,\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager", "modulename": "lakehouse_engine.core.table_manager", "kind": "module", "doc": "

    Table manager module.

    \n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager", "kind": "class", "doc": "

    Set of actions to manipulate tables/views in several ways.

    \n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.__init__", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.__init__", "kind": "function", "doc": "

    Construct TableManager algorithm instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_function", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_function", "kind": "function", "doc": "

    Get a specific function to execute.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create", "kind": "function", "doc": "

    Create a new table or view on metastore.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create_many", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create_many", "kind": "function", "doc": "

    Create multiple tables or views on metastore.

    \n\n

    In this function, the paths to the DDL files can be separated by commas.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.compute_table_statistics", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.compute_table_statistics", "kind": "function", "doc": "

    Compute table statistics.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_table", "kind": "function", "doc": "

    Deletes the table from the metastore and erases all its data.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_view", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_view", "kind": "function", "doc": "

    Deletes the view from the metastore and erases all data.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.truncate", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.truncate", "kind": "function", "doc": "

    Truncate function erases all data but keeps metadata.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.vacuum", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.vacuum", "kind": "function", "doc": "

    Vacuum function erases older versions from Delta Lake tables or locations.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.describe", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.describe", "kind": "function", "doc": "

    Describes the metadata of a table or view.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.optimize", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.optimize", "kind": "function", "doc": "

    Optimize function optimizes the layout of Delta Lake data.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_multiple_sql_files", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_multiple_sql_files", "kind": "function", "doc": "

    Execute multiple statements in multiple sql files.

    \n\n

    In this function, the paths to the files are separated by commas.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_sql", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_sql", "kind": "function", "doc": "

    Execute sql commands separated by semicolon (;).

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.show_tbl_properties", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.show_tbl_properties", "kind": "function", "doc": "

    Show Table Properties.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe with the table properties.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_tbl_pk", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_tbl_pk", "kind": "function", "doc": "

    Get the primary key of a particular table.

    \n\n
    Returns:
    \n\n
    \n

    The list of columns that are part of the primary key.

    \n
    \n", "signature": "(self) -> List[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.repair_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.repair_table", "kind": "function", "doc": "

    Run the repair table command.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.delete_where", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.delete_where", "kind": "function", "doc": "

    Run the delete where command.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors", "modulename": "lakehouse_engine.dq_processors", "kind": "module", "doc": "

    Package to define data quality processes available in the lakehouse engine.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations", "modulename": "lakehouse_engine.dq_processors.custom_expectations", "kind": "module", "doc": "

    Package containing custom DQ expectations available in the lakehouse engine.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "kind": "module", "doc": "

    Expectation to check if column 'a' is lower than or equal to column 'b'.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ColumnPairCustom", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ColumnPairCustom", "kind": "class", "doc": "

    Asserts that column 'A' is lower than or equal to column 'B'.

    \n\n

    Additionally, the 'margin' parameter can be used to add a margin to the\ncheck between column 'A' and 'B': 'A' <= 'B' + 'margin'.

    \n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_pair_map_metric_provider.ColumnPairMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ExpectColumnPairAToBeSmallerOrEqualThanB", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ExpectColumnPairAToBeSmallerOrEqualThanB", "kind": "class", "doc": "

    Expect values in column A to be lower than or equal to the values in column B.

    \n\n
    Arguments:
    \n\n\n\n
    Keyword Args:
    \n\n
    \n
      \n
    • allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
    • \n
    • ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).
    • \n
    • result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.
    • \n
    • include_config: If True (default), then include the expectation config\n as part of the result object.
    • \n
    • catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.
    • \n
    • meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.
    • \n
    \n
    \n\n
    Returns:
    \n\n
    \n

    An ExpectationSuiteValidationResult.

    \n
    \n", "bases": "great_expectations.expectations.expectation.ColumnPairMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "kind": "module", "doc": "

    Expectation to check if column value is a date within a timeframe.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ColumnValuesDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ColumnValuesDateNotOlderThan", "kind": "class", "doc": "

    Asserts that column values are a date that isn't older than a given date.

    \n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_map_metric_provider.ColumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ExpectColumnValuesToBeDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ExpectColumnValuesToBeDateNotOlderThan", "kind": "class", "doc": "

    Expect values in the column to be dates that are not older than a given time.

    \n\n

    Since timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.

    \n\n
    Arguments:
    \n\n\n\n
    Keyword Args:
    \n\n
    \n
      \n
    • allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
    • \n
    • ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).
    • \n
    • result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.
    • \n
    • include_config: If True (default), then include the expectation config\n as part of the result object.
    • \n
    • catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.
    • \n
    • meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.
    • \n
    \n
    \n\n
    Returns:
    \n\n
    \n

    An ExpectationSuiteValidationResult.

    \n
    \n", "bases": "great_expectations.expectations.expectation.ColumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "kind": "module", "doc": "

    Expectation to check if column 'a' equals 'b', or 'c'.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.MulticolumnCustomMetric", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "MulticolumnCustomMetric", "kind": "class", "doc": "

    Expectation metric definition.

    \n\n

    This expectation asserts that column 'a' must equal column 'b' or column 'c'.\nIn addition to this, it is possible to validate that column 'b' or 'c' matches a regex.

    \n", "bases": "great_expectations.expectations.metrics.map_metric_provider.multicolumn_map_metric_provider.MulticolumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.ExpectMulticolumnColumnAMustEqualBOrC", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "ExpectMulticolumnColumnAMustEqualBOrC", "kind": "class", "doc": "

    Expect column 'a' to be equal to column 'b' when 'b' is not empty; otherwise 'a' must be equal to column 'c'.

    \n\n
    Arguments:
    \n\n\n\n
    Keyword Args:
    \n\n
    \n
      \n
    • ignore_row_if: defaults to \"never\".
    • \n
    • result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC, COMPLETE, or SUMMARY.\n Default set to BASIC.
    • \n
    • include_config: If True, then include the expectation\n config as part of the result object.\n Default set to True.
    • \n
    • catch_exceptions: If True, then catch exceptions\n and include them as part of the result object.\n Default set to False.
    • \n
    \n
    \n\n
    Returns:
    \n\n
    \n

    An ExpectationSuiteValidationResult.

    \n
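    A minimal usage sketch follows; the "column_list" argument is assumed from the usual multicolumn expectation convention and is not confirmed by this documentation.

    ```python
    # Hypothetical sketch: "column_list" is an assumed argument name following the
    # common multicolumn expectation convention.
    from lakehouse_engine.core.definitions import DQFunctionSpec

    a_equals_b_or_c = DQFunctionSpec(
        function="expect_multicolumn_column_a_must_equal_b_or_c",
        args={"column_list": ["a", "b", "c"]},
    )
    ```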
    \n", "bases": "great_expectations.expectations.expectation.MulticolumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "kind": "module", "doc": "

    Expectation to check if an aggregated column satisfies the condition.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe", "kind": "class", "doc": "

    Expect the aggregation of a column to satisfy the specified condition.

    \n\n
    Arguments:
    \n\n\n", "bases": "great_expectations.expectations.expectation.QueryExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe.validate_configuration", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe.validate_configuration", "kind": "function", "doc": "

    Validates that a configuration has been set.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    None. Raises InvalidExpectationConfigurationError if the configuration is invalid.

    \n
    \n", "signature": "(\tself,\tconfiguration: Optional[great_expectations.core.expectation_configuration.ExpectationConfiguration] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "kind": "module", "doc": "

    Module containing the class definition of the Data Quality Factory.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory", "kind": "class", "doc": "

    Class for the Data Quality Factory.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.run_dq_process", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.run_dq_process", "kind": "function", "doc": "

    Run the specified data quality process on a dataframe.

    \n\n

    Based on the dq_specs, we apply the defined expectations on top of the dataframe\nto run the necessary validations and then output the result of\nthe data quality process.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The DataFrame containing the results of the DQ process.

    \n
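    A minimal sketch of calling this process on an in-memory dataframe follows. The DQSpec field names (spec_id, input_id, dq_type, dq_functions) are assumptions based on the definitions module and may not be the complete required set.

    ```python
    # Sketch (DQSpec field names assumed for illustration): run a validator-type
    # DQ process on an existing Spark DataFrame named orders_df.
    from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec
    from lakehouse_engine.dq_processors.dq_factory import DQFactory

    dq_spec = DQSpec(
        spec_id="orders_dq",
        input_id="orders",
        dq_type="validator",
        dq_functions=[
            DQFunctionSpec(
                function="expect_column_values_to_not_be_null",
                args={"column": "order_id"},
            )
        ],
    )

    validated_df = DQFactory.run_dq_process(dq_spec, orders_df)
    ```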
    \n", "signature": "(\tcls,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.build_data_docs", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.build_data_docs", "kind": "function", "doc": "

    Build Data Docs for the project.

    \n\n

    This function does a full build of the data docs based on all the Great Expectations\ncheckpoints in the specified location, retrieving the full history of runs/validations\nexecuted and their results.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tstore_backend: str = 's3',\tlocal_fs_root_dir: str = None,\tdata_docs_local_fs: str = None,\tdata_docs_prefix: str = 'dq/data_docs/site/',\tbucket: str = None,\tdata_docs_bucket: str = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/') -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.exceptions", "modulename": "lakehouse_engine.dq_processors.exceptions", "kind": "module", "doc": "

    Package defining all the DQ custom exceptions.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQValidationsFailedException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQValidationsFailedException", "kind": "class", "doc": "

    Exception for when the data quality validations fail.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQCheckpointsResultsException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQCheckpointsResultsException", "kind": "class", "doc": "

    Exception for when the checkpoint results parsing fails.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.validator", "modulename": "lakehouse_engine.dq_processors.validator", "kind": "module", "doc": "

    Module containing the definition of a data quality validator.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator", "kind": "class", "doc": "

    Class containing the data quality validator.

    \n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.get_dq_validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.get_dq_validator", "kind": "function", "doc": "

    Get a validator according to the specification.

    \n\n

    We use getattr to dynamically execute any available expectation.\ngetattr(validator, function) is equivalent to validator.function(), so with this\napproach we can execute any supported expectation; a small sketch of this pattern follows the Returns section.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The validator with the expectation suite stored.

    \n
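    The sketch below illustrates the getattr dispatch pattern described above; it assumes a Great Expectations validator instance (here named validator) and an illustrative expectation name and argument dictionary.

    ```python
    # Illustration of the dynamic dispatch pattern: calling an expectation by name
    # is equivalent to calling the method directly on the validator.
    expectation_name = "expect_column_values_to_not_be_null"
    expectation_args = {"column": "order_id"}

    # Equivalent to: validator.expect_column_values_to_not_be_null(column="order_id")
    result = getattr(validator, expectation_name)(**expectation_args)
    ```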
    \n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tdq_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec],\tcritical_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec]) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.tag_source_with_dq", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.tag_source_with_dq", "kind": "function", "doc": "

    Tags the source dataframe with a new column having the DQ results.

    \n\n
    Arguments:
    \n\n\n\n

    Returns: a dataframe tagged with the DQ results.

    \n", "signature": "(\tcls,\tsource_pk: List[str],\tsource_df: pyspark.sql.dataframe.DataFrame,\tresults_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine", "modulename": "lakehouse_engine.engine", "kind": "module", "doc": "

    Contract of the lakehouse engine with all the available functions to be executed.

    \n"}, {"fullname": "lakehouse_engine.engine.load_data", "modulename": "lakehouse_engine.engine", "qualname": "load_data", "kind": "function", "doc": "

    Load data using the DataLoader algorithm.
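    A minimal sketch of invoking load_data with an inline ACON (algorithm configuration) follows. The spec keys and values shown are illustrative assumptions, not a complete or validated configuration.

    ```python
    # Illustrative ACON only: keys, locations and formats are assumptions for the
    # sketch, not a validated configuration.
    from lakehouse_engine.engine import load_data

    acon = {
        "input_specs": [
            {
                "spec_id": "orders_raw",
                "read_type": "batch",
                "data_format": "csv",
                "location": "s3://my-bucket/orders/",
            }
        ],
        "output_specs": [
            {
                "spec_id": "orders_bronze",
                "input_id": "orders_raw",
                "data_format": "delta",
                "location": "s3://my-bucket/bronze/orders/",
            }
        ],
    }

    load_data(acon=acon)
    ```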

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_reconciliation", "modulename": "lakehouse_engine.engine", "qualname": "execute_reconciliation", "kind": "function", "doc": "

    Execute the Reconciliator algorithm.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_dq_validation", "modulename": "lakehouse_engine.engine", "qualname": "execute_dq_validation", "kind": "function", "doc": "

    Execute the DQValidator algorithm.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_table", "modulename": "lakehouse_engine.engine", "qualname": "manage_table", "kind": "function", "doc": "

    Manipulate tables/views using the Table Manager algorithm.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_files", "modulename": "lakehouse_engine.engine", "qualname": "manage_files", "kind": "function", "doc": "

    Manipulate S3 files using the File Manager algorithm.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_sensor", "modulename": "lakehouse_engine.engine", "qualname": "execute_sensor", "kind": "function", "doc": "

    Execute a sensor based on a Sensor Algorithm Configuration.

    \n\n

    A sensor is useful to check if an upstream system has new data.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.update_sensor_status", "modulename": "lakehouse_engine.engine", "qualname": "update_sensor_status", "kind": "function", "doc": "

    Update internal sensor status.

    \n\n

    Update the sensor status in the control table.\nIt should be used to tell the system\nthat the sensor has processed all new data that was previously identified,\nhence updating the shifted sensor status.\nIt is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA,\nbut there might be scenarios - still to be identified -\nwhere we can update the sensor status from/to different statuses.
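    A minimal sketch based on the signature below; the sensor id and control table name are illustrative.

    ```python
    # Sketch: mark a sensor's newly acquired data as processed
    # (identifiers are illustrative).
    from lakehouse_engine.engine import update_sensor_status

    update_sensor_status(
        sensor_id="my_upstream_sensor",
        control_db_table_name="my_db.lakehouse_engine_sensors",
        status="PROCESSED_NEW_DATA",
    )
    ```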

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_query", "kind": "function", "doc": "

    Generates a preprocess query to be used in a sensor configuration.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    The query string.

    \n
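    A minimal sketch based on the signature below; the table names are illustrative, and the "?upstream_key"/"?upstream_value" placeholders in the filter expression are an assumption about how the query is templated.

    ```python
    # Sketch: generate a preprocess query for a sensor configuration
    # (identifiers and placeholder syntax are assumptions).
    from lakehouse_engine.engine import generate_sensor_query

    preprocess_query = generate_sensor_query(
        sensor_id="my_upstream_sensor",
        filter_exp="?upstream_key > '?upstream_value'",
        control_db_table_name="my_db.lakehouse_engine_sensors",
        upstream_key="load_date",
        upstream_table_name="upstream_db.orders",
    )
    ```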
    \n", "signature": "(\tsensor_id: str,\tfilter_exp: str = None,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_sap_logchain_query", "kind": "function", "doc": "

    Generates a sensor query based on the SAP Logchain table.

    \n\n
    Arguments:
    \n\n\n\n
    Return:
    \n\n
    \n

    The query string.

    \n
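    A minimal sketch based on the signature below; the process chain id is illustrative and the remaining parameters keep their defaults.

    ```python
    # Sketch: build the sensor query for a given SAP process chain
    # (chain id is illustrative).
    from lakehouse_engine.engine import generate_sensor_sap_logchain_query

    query = generate_sensor_sap_logchain_query(chain_id="Z_MY_PROCESS_CHAIN")
    ```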
    \n", "signature": "(\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.send_notification", "modulename": "lakehouse_engine.engine", "qualname": "send_notification", "kind": "function", "doc": "

    Send a notification using a notifier.

    \n\n
    Arguments:
    \n\n\n", "signature": "(args: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.build_data_docs", "modulename": "lakehouse_engine.engine", "qualname": "build_data_docs", "kind": "function", "doc": "

    Build Data Docs for the project.

    \n\n

    This function does a full build of the data docs based on all the Great Expectations\ncheckpoints in the specified location, retrieving the full history of runs/validations\nexecuted and their results.
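    A minimal sketch based on the signature below, doing a full rebuild of the data docs stored in S3; bucket names are illustrative and the prefixes keep their defaults.

    ```python
    # Sketch: rebuild data docs from the checkpoints stored in S3
    # (bucket names are illustrative).
    from lakehouse_engine.engine import build_data_docs

    build_data_docs(
        store_backend="s3",
        bucket="my-dq-bucket",
        data_docs_bucket="my-dq-docs-bucket",
        data_docs_prefix="dq/data_docs/site/",
    )
    ```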

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tstore_backend: str = 's3',\tlocal_fs_root_dir: str = None,\tdata_docs_local_fs: str = None,\tdata_docs_prefix: str = 'dq/data_docs/site/',\tbucket: str = None,\tdata_docs_bucket: str = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/') -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_gab", "modulename": "lakehouse_engine.engine", "qualname": "execute_gab", "kind": "function", "doc": "

    Execute the gold asset builder based on a GAB Algorithm Configuration.

    \n\n

    GAB is useful to build your gold assets with predefined functions for recurring\nperiods.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tcollect_engine_usage: str = 'prod_only',\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io", "modulename": "lakehouse_engine.io", "kind": "module", "doc": "

    Input and Output package responsible for the behaviour of reading and writing.

    \n"}, {"fullname": "lakehouse_engine.io.exceptions", "modulename": "lakehouse_engine.io.exceptions", "kind": "module", "doc": "

    Package defining all the io custom exceptions.

    \n"}, {"fullname": "lakehouse_engine.io.exceptions.IncrementalFilterInputNotFoundException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "IncrementalFilterInputNotFoundException", "kind": "class", "doc": "

    Exception for when the input of an incremental filter is not found.

    \n\n

    This may occur when tables are being loaded incrementally, taking the increment\ndefinition from a specific table, but that table does not exist yet, most likely\nbecause it has not been loaded for the first time.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.WrongIOFormatException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "WrongIOFormatException", "kind": "class", "doc": "

    Exception for when a user provides a wrong I/O format.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.NotSupportedException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "NotSupportedException", "kind": "class", "doc": "

    Exception for when a user provides an unsupported operation.

    \n", "bases": "builtins.RuntimeError"}, {"fullname": "lakehouse_engine.io.reader", "modulename": "lakehouse_engine.io.reader", "kind": "module", "doc": "

    Defines abstract reader behaviour.

    \n"}, {"fullname": "lakehouse_engine.io.reader.Reader", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader", "kind": "class", "doc": "

    Abstract Reader class.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader.Reader.__init__", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.__init__", "kind": "function", "doc": "

    Construct Reader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.reader.Reader.read", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.read", "kind": "function", "doc": "

    Abstract read method.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe read according to the input specification.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.reader_factory", "modulename": "lakehouse_engine.io.reader_factory", "kind": "module", "doc": "

    Module for reader factory.

    \n"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory", "kind": "class", "doc": "

    Class for reader factory.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory.get_data", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory.get_data", "kind": "function", "doc": "

    Get data according to the input specification following a factory pattern.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data.

    \n
    \n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.InputSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers", "modulename": "lakehouse_engine.io.readers", "kind": "module", "doc": "

    Readers package to define reading behaviour.

    \n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "kind": "module", "doc": "

    Module to define behaviour to read from dataframes.

    \n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader", "kind": "class", "doc": "

    Class to read data from a dataframe.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.__init__", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.__init__", "kind": "function", "doc": "

    Construct DataFrameReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.read", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.read", "kind": "function", "doc": "

    Read data from a dataframe.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from a previously computed dataframe.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.file_reader", "modulename": "lakehouse_engine.io.readers.file_reader", "kind": "module", "doc": "

    Module to define behaviour to read from files.

    \n"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader", "kind": "class", "doc": "

    Class to read from files.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.__init__", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.__init__", "kind": "function", "doc": "

    Construct FileReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.read", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.read", "kind": "function", "doc": "

    Read file data.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from the files.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "kind": "module", "doc": "

    Module to define behaviour to read from JDBC sources.

    \n"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader", "kind": "class", "doc": "

    Class to read from JDBC source.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.__init__", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.__init__", "kind": "function", "doc": "

    Construct JDBCReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.read", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.read", "kind": "function", "doc": "

    Read data from JDBC source.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from the JDBC source.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "kind": "module", "doc": "

    Module to define behaviour to read from Kafka.

    \n"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader", "kind": "class", "doc": "

    Class to read from Kafka.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.__init__", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.__init__", "kind": "function", "doc": "

    Construct KafkaReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.read", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.read", "kind": "function", "doc": "

    Read Kafka data.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from Kafka.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.query_reader", "modulename": "lakehouse_engine.io.readers.query_reader", "kind": "module", "doc": "

    Module to define behaviour to read from a query.

    \n"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader", "kind": "class", "doc": "

    Class to read data from a query.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.__init__", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.__init__", "kind": "function", "doc": "

    Construct QueryReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.read", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.read", "kind": "function", "doc": "

    Read data from a query.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from the query.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "kind": "module", "doc": "

    Module to define behaviour to read from SAP B4 sources.

    \n"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader", "kind": "class", "doc": "

    Class to read from SAP B4 source.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.__init__", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.__init__", "kind": "function", "doc": "

    Construct SAPB4Reader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.read", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.read", "kind": "function", "doc": "

    Read data from SAP B4 source.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from the SAP B4 source.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "kind": "module", "doc": "

    Module to define behaviour to read from SAP BW sources.

    \n"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader", "kind": "class", "doc": "

    Class to read from SAP BW source.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.__init__", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.__init__", "kind": "function", "doc": "

    Construct SAPBWReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.read", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.read", "kind": "function", "doc": "

    Read data from SAP BW source.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from the SAP BW source.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "kind": "module", "doc": "

    Module to define behaviour to read from SFTP.

    \n"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader", "kind": "class", "doc": "

    Class to read from SFTP.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.__init__", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.__init__", "kind": "function", "doc": "

    Construct SFTPReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.read", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.read", "kind": "function", "doc": "

    Read SFTP data.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from SFTP.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.table_reader", "modulename": "lakehouse_engine.io.readers.table_reader", "kind": "module", "doc": "

    Module to define behaviour to read from tables.

    \n"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader", "kind": "class", "doc": "

    Class to read data from a table.

    \n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.__init__", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.__init__", "kind": "function", "doc": "

    Construct TableReader instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.read", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.read", "kind": "function", "doc": "

    Read data from a table.

    \n\n
    Returns:
    \n\n
    \n

    A dataframe containing the data from the table.

    \n
    \n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer", "modulename": "lakehouse_engine.io.writer", "kind": "module", "doc": "

    Defines abstract writer behaviour.

    \n"}, {"fullname": "lakehouse_engine.io.writer.Writer", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer", "kind": "class", "doc": "

    Abstract Writer class.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer.Writer.__init__", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.__init__", "kind": "function", "doc": "

    Construct Writer instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict = None)"}, {"fullname": "lakehouse_engine.io.writer.Writer.write", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write", "kind": "function", "doc": "

    Abstract write method.

    \n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.write_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write_transformed_micro_batch", "kind": "function", "doc": "

    Define how to write a streaming micro batch after transforming it.

    \n\n

    This function must define an inner function that manipulates a streaming batch\nand then return that function (see the sketch after the Returns section). Look at the concrete implementations of this\nfunction for more clarity.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the foreachBatch spark write method.

    \n
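    The sketch below is a generic illustration of the pattern described above, not the engine's actual implementation: an outer function builds and returns an inner function that is handed to Spark's foreachBatch. Names and the target location are illustrative.

    ```python
    # Generic illustration of the foreachBatch pattern (names are illustrative).
    from pyspark.sql import DataFrame

    def make_micro_batch_writer(target_location: str):
        def write_micro_batch(batch_df: DataFrame, batch_id: int) -> None:
            # Transform and persist one micro batch.
            batch_df.write.format("delta").mode("append").save(target_location)

        return write_micro_batch

    # stream_df.writeStream.foreachBatch(
    #     make_micro_batch_writer("s3://my-bucket/table/")
    # ).start()
    ```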
    \n", "signature": "(**kwargs: Any) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_transformed_micro_batch", "kind": "function", "doc": "

    Get the result of the transformations applied to a micro batch dataframe.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The transformed dataframe.

    \n
    \n", "signature": "(\tcls,\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tbatch_df: pyspark.sql.dataframe.DataFrame,\tbatch_id: int,\tdata: OrderedDict) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_streaming_trigger", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_streaming_trigger", "kind": "function", "doc": "

    Define which streaming trigger will be used.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A dict containing streaming trigger.

    \n
    \n", "signature": "(cls, output_spec: lakehouse_engine.core.definitions.OutputSpec) -> Dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.run_micro_batch_dq_process", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.run_micro_batch_dq_process", "kind": "function", "doc": "

    Run the data quality process in a streaming micro batch dataframe.

    \n\n

    Iterates over the specs and performs the checks or analysis depending on the\ndata quality specification provided in the configuration.

    \n\n
    Arguments:
    \n\n\n\n

    Returns: the validated dataframe.

    \n", "signature": "(\tdf: pyspark.sql.dataframe.DataFrame,\tdq_spec: List[lakehouse_engine.core.definitions.DQSpec]) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer_factory", "modulename": "lakehouse_engine.io.writer_factory", "kind": "module", "doc": "

    Module for writer factory.

    \n"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory", "kind": "class", "doc": "

    Class for writer factory.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory.get_writer", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory.get_writer", "kind": "function", "doc": "

    Get a writer according to the output specification using a factory pattern.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Writer: writer that will write the data.

    \n
    \n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict) -> lakehouse_engine.io.writer.Writer:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers", "modulename": "lakehouse_engine.io.writers", "kind": "module", "doc": "

    Package containing the writers responsible for writing data.

    \n"}, {"fullname": "lakehouse_engine.io.writers.console_writer", "modulename": "lakehouse_engine.io.writers.console_writer", "kind": "module", "doc": "

    Module to define behaviour to write to console.

    \n"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter", "kind": "class", "doc": "

    Class to write data to console.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.__init__", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.__init__", "kind": "function", "doc": "

    Construct ConsoleWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.write", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.write", "kind": "function", "doc": "

    Write data to console.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "kind": "module", "doc": "

    Module to define behaviour to write to dataframe.

    \n"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter", "kind": "class", "doc": "

    Class to write data to dataframe.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.__init__", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.__init__", "kind": "function", "doc": "

    Construct DataFrameWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.write", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.write", "kind": "function", "doc": "

    Write data to dataframe.

    \n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "kind": "module", "doc": "

    Module to define the behaviour of delta merges.

    \n"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter", "kind": "class", "doc": "

    Class to merge data using delta lake.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.__init__", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.__init__", "kind": "function", "doc": "

    Construct DeltaMergeWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.write", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.write", "kind": "function", "doc": "

    Merge new data with current data.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.file_writer", "modulename": "lakehouse_engine.io.writers.file_writer", "kind": "module", "doc": "

    Module to define behaviour to write to files.

    \n"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter", "kind": "class", "doc": "

    Class to write data to files.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.__init__", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.__init__", "kind": "function", "doc": "

    Construct FileWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.write", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.write", "kind": "function", "doc": "

    Write data to files.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "kind": "module", "doc": "

    Module that defines the behaviour to write to JDBC targets.

    \n"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter", "kind": "class", "doc": "

    Class to write to JDBC targets.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.__init__", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.__init__", "kind": "function", "doc": "

    Construct JDBCWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.write", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.write", "kind": "function", "doc": "

    Write data into JDBC target.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer", "modulename": "lakehouse_engine.io.writers.kafka_writer", "kind": "module", "doc": "

    Module that defines the behaviour to write to Kafka.

    \n"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter", "kind": "class", "doc": "

    Class to write to a Kafka target.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.__init__", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.__init__", "kind": "function", "doc": "

    Construct KafkaWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.write", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.write", "kind": "function", "doc": "

    Write data to Kafka.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.table_writer", "modulename": "lakehouse_engine.io.writers.table_writer", "kind": "module", "doc": "

    Module that defines the behaviour to write to tables.

    \n"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter", "kind": "class", "doc": "

    Class to write to a table.

    \n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.__init__", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.__init__", "kind": "function", "doc": "

    Construct TableWriter instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.write", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.write", "kind": "function", "doc": "

    Write data to a table.

    \n\n

    After the write operation we repair the table (e.g., update partitions).\nHowever, there is a caveat: this repair operation is not reachable if we are\nrunning in long-running streaming mode. Therefore, we recommend not using the\nTableWriter with formats other than delta lake in those scenarios (as delta lake\ndoes not need msck repair). So, you can: 1) use the delta lake format for the\ntable; or 2) use the FileWriter and run the repair with a certain frequency in a\nseparate task of your pipeline.
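    A minimal sketch of option 2 above follows: repairing a non-delta partitioned table periodically in a separate task. It assumes an active SparkSession named spark, and the table name is illustrative.

    ```python
    # Sketch: repair a non-delta partitioned table in a separate, scheduled task
    # instead of relying on the writer (table name is illustrative).
    spark.sql("MSCK REPAIR TABLE my_db.my_partitioned_table")
    ```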

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators", "modulename": "lakehouse_engine.terminators", "kind": "module", "doc": "

    Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).

    \n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor", "modulename": "lakehouse_engine.terminators.cdf_processor", "kind": "module", "doc": "

    Defines change data feed processor behaviour.

    \n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor", "kind": "class", "doc": "

    Change data feed processor class.

    \n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.expose_cdf", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.expose_cdf", "kind": "function", "doc": "

    Expose CDF to external location.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.delete_old_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.delete_old_data", "kind": "function", "doc": "

    Delete old data from cdf delta table.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.vacuum_cdf_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.vacuum_cdf_data", "kind": "function", "doc": "

    Vacuum old data from cdf delta table.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "kind": "module", "doc": "

    Module with dataset optimizer terminator.

    \n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer", "kind": "class", "doc": "

    Class with dataset optimizer terminator.

    \n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer.optimize_dataset", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer.optimize_dataset", "kind": "function", "doc": "

    Optimize a dataset based on a set of pre-conceived optimizations.

    \n\n

    Most of the time the dataset is a table, but it can also be file-based only.
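    A minimal sketch based on the signature below: optimize, z-order and vacuum a delta table. The table name, z-order column and retention period are illustrative.

    ```python
    # Sketch: optimize and vacuum a delta table (identifiers are illustrative).
    from lakehouse_engine.terminators.dataset_optimizer import DatasetOptimizer

    DatasetOptimizer.optimize_dataset(
        db_table="my_db.sales",
        optimize_zorder_col_list=["customer_id"],
        vacuum_hours=168,
    )
    ```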

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tcompute_table_stats: bool = True,\tvacuum: bool = True,\tvacuum_hours: int = 720,\toptimize: bool = True,\toptimize_where: Optional[str] = None,\toptimize_zorder_col_list: Optional[List[str]] = None,\tdebug: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier", "modulename": "lakehouse_engine.terminators.notifier", "kind": "module", "doc": "

    Module with notification terminator.

    \n"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier", "kind": "class", "doc": "

    Abstract Notification class.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.__init__", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.__init__", "kind": "function", "doc": "

    Construct Notification instances.

    \n\n
    Arguments:
    \n\n\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.create_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.create_notification", "kind": "function", "doc": "

    Abstract create notification method.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.send_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.send_notification", "kind": "function", "doc": "

    Abstract send notification method.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.check_if_notification_is_failure_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.check_if_notification_is_failure_notification", "kind": "function", "doc": "

    Check if given notification is a failure notification.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A boolean indicating whether the notification is a failure notification.

    \n
    \n", "signature": "(spec: lakehouse_engine.core.definitions.TerminatorSpec) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory", "modulename": "lakehouse_engine.terminators.notifier_factory", "kind": "module", "doc": "

    Module for notifier factory.

    \n"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory", "kind": "class", "doc": "

    Class for notification factory.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.get_notifier", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.get_notifier", "kind": "function", "doc": "

    Get a notifier according to the terminator specs using a factory.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Notifier: notifier that will handle notifications.

    \n
    \n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.TerminatorSpec) -> lakehouse_engine.terminators.notifier.Notifier:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.generate_failure_notification", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.generate_failure_notification", "kind": "function", "doc": "

    Check if it is necessary to send a failure notification and generate it.

    \n\n
    Arguments:
    \n\n\n", "signature": "(spec: list, exception: Exception) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers", "modulename": "lakehouse_engine.terminators.notifiers", "kind": "module", "doc": "

    Notifications module.

    \n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "kind": "module", "doc": "

    Module with email notifier.

    \n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier", "kind": "class", "doc": "

    Email Notifier class.

    \n", "bases": "lakehouse_engine.terminators.notifier.Notifier"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.__init__", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.__init__", "kind": "function", "doc": "

    Construct Email Notification instance.

    \n\n
    Arguments:
    \n\n\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.create_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.create_notification", "kind": "function", "doc": "

    Creates the notification to be sent.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.send_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.send_notification", "kind": "function", "doc": "

    Sends the notification by using a series of methods.

    \n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "kind": "module", "doc": "

    Email notification templates.

    \n"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates.NotificationsTemplates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "qualname": "NotificationsTemplates", "kind": "class", "doc": "

    Templates for notifications.

    \n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "kind": "module", "doc": "

    Module with sensor terminator.

    \n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator", "kind": "class", "doc": "

    Sensor Terminator class.

    \n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator.update_sensor_status", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator.update_sensor_status", "kind": "function", "doc": "

    Update internal sensor status.

    \n\n

    Update the sensor status in the control table. It should be used to tell the\nsystem that the sensor has processed all new data that was previously\nidentified, hence updating the shifted sensor status.\nIt is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still\nto be identified - where we can update the sensor status from/to different statuses.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.spark_terminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "kind": "module", "doc": "

    Module with spark terminator.

    \n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator", "kind": "class", "doc": "

    Spark Terminator class.

    \n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator.terminate_spark", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator.terminate_spark", "kind": "function", "doc": "

    Terminate spark session.

    \n", "signature": "(cls) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.terminator_factory", "modulename": "lakehouse_engine.terminators.terminator_factory", "kind": "module", "doc": "

    Module with the factory pattern to return terminators.

    \n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory", "kind": "class", "doc": "

    TerminatorFactory class following the factory pattern.

    \n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory.execute_terminator", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory.execute_terminator", "kind": "function", "doc": "

    Execute a terminator following the factory pattern.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    None: the terminator is executed for its side effects.

    \n
    \n", "signature": "(\tspec: lakehouse_engine.core.definitions.TerminatorSpec,\tdf: Optional[pyspark.sql.dataframe.DataFrame] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers", "modulename": "lakehouse_engine.transformers", "kind": "module", "doc": "

    Package to define transformers available in the lakehouse engine.

    \n"}, {"fullname": "lakehouse_engine.transformers.aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "kind": "module", "doc": "

    Aggregators module.

    \n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators", "kind": "class", "doc": "

    Class containing all aggregation functions.

    \n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators.get_max_value", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators.get_max_value", "kind": "function", "doc": "

    Get the maximum value of a given column of a dataframe.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
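    The sketch below illustrates how the returned callable is applied through Spark's .transform(); the source dataframe and column names are illustrative.

    ```python
    # Sketch of the .transform() pattern: get the max of a column as a new column
    # (dataframe and column names are illustrative).
    from lakehouse_engine.transformers.aggregators import Aggregators

    latest_df = orders_df.transform(Aggregators.get_max_value("order_date", "latest"))
    ```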
    \n", "signature": "(input_col: str, output_col: str = 'latest') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators", "modulename": "lakehouse_engine.transformers.column_creators", "kind": "module", "doc": "

    Column creators transformers module.

    \n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators", "kind": "class", "doc": "

    Class containing all functions that can create columns to add value.

    \n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_row_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_row_id", "kind": "function", "doc": "

    Create a sequential but not consecutive id.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_auto_increment_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_auto_increment_id", "kind": "function", "doc": "

    Create a sequential and consecutive id.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(cls, output_col: str = 'lhe_row_id', rdd: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_literals", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_literals", "kind": "function", "doc": "

    Create columns given a map of column names and literal values (constants).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Callable: A function to be executed in the .transform() spark function.

    \n
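    Example (an illustrative sketch, not from the original docs; it assumes a local SparkSession, and the column names and literal values are made up):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_creators import ColumnCreators

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "code"])

    # add constant columns "source_system" and "load_year" to every row
    df_with_constants = df.transform(
        ColumnCreators.with_literals({"source_system": "erp", "load_year": 2024})
    )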
    \n", "signature": "(cls, literals: Dict[str, Any]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "kind": "module", "doc": "

    Module with column reshaping transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers", "kind": "class", "doc": "

    Class containing column reshaping transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.cast", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.cast", "kind": "function", "doc": "

    Cast specific columns into the designated type.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
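    Example (an illustrative sketch; it assumes a local SparkSession and that the values of the dict are Spark SQL type names):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("10.5", "2024-06-17")], ["amount", "order_date"])

    # cast the string columns into the designated types
    df_typed = df.transform(
        ColumnReshapers.cast({"amount": "double", "order_date": "date"})
    )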
    \n", "signature": "(cls, cols: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.column_selector", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.column_selector", "kind": "function", "doc": "

    Select specific columns with specific output aliases.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls, cols: collections.OrderedDict) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.flatten_schema", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.flatten_schema", "kind": "function", "doc": "

    Flatten the schema of the dataframe.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.explode_columns", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.explode_columns", "kind": "function", "doc": "

    Explode columns with types like ArrayType and MapType.

    \n\n

    Afterwards, the flatten_schema transformation can be applied, for example if we want to explode the map (as we explode a StructType) or to explode a StructType inside the array. We recommend always specifying the columns you want to explode, rather than exploding all columns.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
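    Example (an illustrative sketch; it assumes a local SparkSession and made-up column names):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, ["a", "b"], {"k": "v"})], ["id", "tags", "attrs"])

    # explode only the listed array and map columns, as recommended above
    df_exploded = df.transform(
        ColumnReshapers.explode_columns(
            explode_arrays=True,
            array_cols_to_explode=["tags"],
            explode_maps=True,
            map_cols_to_explode=["attrs"],
        )
    )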
    \n", "signature": "(\tcls,\texplode_arrays: bool = False,\tarray_cols_to_explode: List[str] = None,\texplode_maps: bool = False,\tmap_cols_to_explode: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.with_expressions", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.with_expressions", "kind": "function", "doc": "

    Execute Spark SQL expressions to create the specified columns.

    \n\n

    This function uses Spark's expr function; the provided values must be valid Spark SQL expressions.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
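    Example (an illustrative sketch; it assumes a local SparkSession; keys are the new column names and values are Spark SQL expressions):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(10, 2)], ["price", "quantity"])

    # create "total" and "loaded_at" from Spark SQL expressions
    df_enriched = df.transform(
        ColumnReshapers.with_expressions(
            {"total": "price * quantity", "loaded_at": "current_timestamp()"}
        )
    )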
    \n", "signature": "(cls, cols_and_exprs: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.rename", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.rename", "kind": "function", "doc": "

    Rename specific columns into the designated name.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro", "kind": "function", "doc": "

    Select all attributes from avro.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tschema: str = None,\tkey_col: str = 'key',\tvalue_col: str = 'value',\toptions: dict = None,\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro_with_registry", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro_with_registry", "kind": "function", "doc": "

    Select all attributes from avro using a schema registry.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tschema_registry: str,\tvalue_schema: str,\tvalue_col: str = 'value',\tkey_schema: str = None,\tkey_col: str = 'key',\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_json", "kind": "function", "doc": "

    Convert a json string into a json column (struct).

    \n\n

    The new json column can be added to the existing columns (default) or it can replace all the others, becoming the only column in the output. The new column gets the same name as the original one, suffixed with '_json'.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
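    Example (an illustrative sketch; it assumes a local SparkSession and that the schema dict follows Spark's StructType JSON representation):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([('{"id": 1, "name": "abc"}',)], ["payload"])

    # assumed schema format: a Spark StructType expressed as a JSON-like dict
    schema = {
        "type": "struct",
        "fields": [
            {"name": "id", "type": "long", "nullable": True, "metadata": {}},
            {"name": "name", "type": "string", "nullable": True, "metadata": {}},
        ],
    }

    # adds a struct column named "payload_json" next to the existing columns
    df_parsed = df.transform(
        ColumnReshapers.from_json(input_col="payload", schema=schema)
    )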
    \n", "signature": "(\tcls,\tinput_col: str,\tschema_path: Optional[str] = None,\tschema: Optional[dict] = None,\tjson_options: Optional[dict] = None,\tdrop_all_cols: bool = False,\tdisable_dbfs_retry: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.to_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.to_json", "kind": "function", "doc": "

    Convert dataframe columns into a json value.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tin_cols: List[str],\tout_col: str,\tjson_options: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers", "modulename": "lakehouse_engine.transformers.condensers", "kind": "module", "doc": "

    Condensers module.

    \n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers", "kind": "class", "doc": "

    Class containing all the functions to condensate data for later merges.

    \n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.condense_record_mode_cdc", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.condense_record_mode_cdc", "kind": "function", "doc": "

    Condense Change Data Capture (CDC) based on record_mode strategy.

    \n\n

    This type of CDC data is typically seen in some CDC-enabled systems; other systems may use different CDC strategies.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
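    Example (an illustrative sketch; it assumes a local SparkSession; the column names and record-mode values are made up):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.condensers import Condensers

    spark = SparkSession.builder.getOrCreate()
    changes_df = spark.createDataFrame(
        [(1, "", "2024-06-17 10:00:00"), (1, "", "2024-06-17 12:00:00")],
        ["customer_id", "record_mode", "extraction_timestamp"],
    )

    # keep only the latest valid change per business key
    condensed_df = changes_df.transform(
        Condensers.condense_record_mode_cdc(
            business_key=["customer_id"],
            record_mode_col="record_mode",
            valid_record_modes=["", "N", "R", "D", "X"],
            ranking_key_desc=["extraction_timestamp"],
        )
    )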
    \n", "signature": "(\tcls,\tbusiness_key: List[str],\trecord_mode_col: str,\tvalid_record_modes: List[str],\tranking_key_desc: Optional[List[str]] = None,\tranking_key_asc: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.group_and_rank", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.group_and_rank", "kind": "function", "doc": "

    Condense data based on a simple group by + take latest mechanism.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tgroup_key: List[str],\tranking_key: List[str],\tdescending: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.custom_transformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "kind": "module", "doc": "

    Custom transformers module.

    \n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers", "kind": "class", "doc": "

    Class representing a CustomTransformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers.custom_transformation", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers.custom_transformation", "kind": "function", "doc": "

    Execute a custom transformation provided by the user.

    \n\n

    This transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.

    \n\n
    \n\n
    Attention!
    \n\n

    Please bear in mind that the custom_transformer function provided as argument needs to receive a DataFrame and return a DataFrame, because that is how Spark's .transform method is able to chain the transformations.

    \n\n
    \n\n

    Example:

    \n\n
    \n
    def my_custom_logic(df: DataFrame) -> DataFrame:
        return df  # illustrative: apply any custom logic and return the transformed DataFrame
    \n
    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Callable: the same function provided as parameter, in order to be called later in the TransformerFactory.

    \n
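    A fuller usage sketch (illustrative; it assumes a local SparkSession and that the returned callable is chained directly with .transform(), outside the TransformerFactory):

    from pyspark.sql import DataFrame, SparkSession
    from lakehouse_engine.transformers.custom_transformers import CustomTransformers

    def my_custom_logic(df: DataFrame) -> DataFrame:
        # any DataFrame-in / DataFrame-out logic works here
        return df.dropDuplicates()

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1,), (1,)], ["id"])

    # custom_transformation returns the same callable, ready to be chained
    df_out = df.transform(CustomTransformers.custom_transformation(my_custom_logic))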
    \n", "signature": "(custom_transformer: Callable) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers", "modulename": "lakehouse_engine.transformers.data_maskers", "kind": "module", "doc": "

    Module with data masking transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers", "kind": "class", "doc": "

    Class containing data masking transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.hash_masker", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.hash_masker", "kind": "function", "doc": "

    Mask specific columns using a hashing approach.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tcols: List[str],\tapproach: str = 'SHA',\tnum_bits: int = 256,\tsuffix: str = '_hash') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.column_dropper", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.column_dropper", "kind": "function", "doc": "

    Drop specific columns.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls, cols: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers", "modulename": "lakehouse_engine.transformers.date_transformers", "kind": "module", "doc": "

    Module containing date transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers", "kind": "class", "doc": "

    Class with a set of transformers to transform dates in several forms.

    \n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.add_current_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.add_current_date", "kind": "function", "doc": "

    Add column with current date.

    \n\n

    The current date comes from the driver as a constant, not from every executor.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(output_col: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_date", "kind": "function", "doc": "

    Convert multiple string columns with a source format into dates.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_timestamp", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_timestamp", "kind": "function", "doc": "

    Convert multiple string columns with a source format into timestamps.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.format_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.format_date", "kind": "function", "doc": "

    Convert multiple date/timestamp columns into strings with the target format.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(cols: List[str], target_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.get_date_hierarchy", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.get_date_hierarchy", "kind": "function", "doc": "

    Create day/month/week/quarter/year hierarchy for the provided date columns.

    \n\n

    Uses Spark's extract function.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
    \n", "signature": "(cols: List[str], formats: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.exceptions", "modulename": "lakehouse_engine.transformers.exceptions", "kind": "module", "doc": "

    Module for all the transformers exceptions.

    \n"}, {"fullname": "lakehouse_engine.transformers.exceptions.WrongArgumentsException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "WrongArgumentsException", "kind": "class", "doc": "

    Exception for when a user provides wrong arguments to a transformer.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.exceptions.UnsupportedStreamingTransformerException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "UnsupportedStreamingTransformerException", "kind": "class", "doc": "

    Exception for when a user requests a transformer not supported in streaming.

    \n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.filters", "modulename": "lakehouse_engine.transformers.filters", "kind": "module", "doc": "

    Module containing the filters transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters", "kind": "class", "doc": "

    Class containing the filters transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.incremental_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.incremental_filter", "kind": "function", "doc": "

    Incrementally filter a certain dataframe given an increment logic.

    \n\n

    This logic can either be an increment value or an increment dataframe from which to get the latest value. By default, the filtering operator is greater or equal, to cover cases where we receive late-arriving data that was not covered in a previous load. You can set greater_or_equal to false to use greater instead, when you trust the source will never output more data for the increment after you have loaded it (i.e., you will never load data while the source is still dumping it, which could otherwise give you an incomplete picture of the most recently arrived data).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
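    Example (an illustrative sketch; it assumes a local SparkSession and a made-up load_date column):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.filters import Filters

    spark = SparkSession.builder.getOrCreate()
    new_data_df = spark.createDataFrame(
        [("2024-06-16", 1), ("2024-06-17", 2)], ["load_date", "id"]
    )

    # keep only rows at or after the increment value (>= covers late-arriving data)
    df_incremental = new_data_df.transform(
        Filters.incremental_filter(
            input_col="load_date",
            increment_value="2024-06-17",
            greater_or_equal=True,
        )
    )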
    \n", "signature": "(\tcls,\tinput_col: str,\tincrement_value: Optional[Any] = None,\tincrement_df: Optional[pyspark.sql.dataframe.DataFrame] = None,\tincrement_col: str = 'latest',\tgreater_or_equal: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.expression_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.expression_filter", "kind": "function", "doc": "

    Filter a dataframe based on an expression.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(exp: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.column_filter_exp", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.column_filter_exp", "kind": "function", "doc": "

    Filter a dataframe's columns based on a list of SQL expressions.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(exp: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.drop_duplicate_rows", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.drop_duplicate_rows", "kind": "function", "doc": "

    Drop duplicate rows using spark function dropDuplicates().

    \n\n

    This transformer can be used with or without arguments. The provided argument needs to be a list of columns. For example: ["Name", "VAT"] will drop duplicate records within the "Name" and "VAT" columns. If the transformer is used without providing any columns list, or providing an empty list such as [], the result will be the same as using the distinct() pyspark function. If the watermark dict is present, it will ensure that the drop operation applies only to rows within the watermark timeline window.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
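    Example (an illustrative sketch; it assumes a local SparkSession):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.filters import Filters

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("A", "PT123"), ("A", "PT123")], ["Name", "VAT"])

    # drop duplicates considering only the "Name" and "VAT" columns
    df_deduped = df.transform(Filters.drop_duplicate_rows(cols=["Name", "VAT"]))

    # with no columns (or an empty list) it behaves like distinct()
    df_distinct = df.transform(Filters.drop_duplicate_rows())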
    \n", "signature": "(cols: List[str] = None, watermarker: dict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.joiners", "modulename": "lakehouse_engine.transformers.joiners", "kind": "module", "doc": "

    Module with join transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners", "kind": "class", "doc": "

    Class containing join transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners.join", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners.join", "kind": "function", "doc": "

    Join two dataframes based on specified type and columns.

    \n\n

    Some stream-to-stream joins are only possible if you apply a watermark, so this method also provides a parameter to enable the watermarking specification.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
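    Example (an illustrative sketch; it assumes a local SparkSession and that the join condition references the default aliases "a" (left) and "b" (right)):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.joiners import Joiners

    spark = SparkSession.builder.getOrCreate()
    orders = spark.createDataFrame([(1, 100)], ["customer_id", "amount"])
    customers = spark.createDataFrame([(1, "ACME")], ["customer_id", "name"])

    # left join orders (alias "a") with customers (alias "b")
    joined = orders.transform(
        Joiners.join(
            join_with=customers,
            join_condition="a.customer_id = b.customer_id",
            join_type="left",
            select_cols=["a.customer_id", "a.amount", "b.name"],
        )
    )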
    \n", "signature": "(\tcls,\tjoin_with: pyspark.sql.dataframe.DataFrame,\tjoin_condition: str,\tleft_df_alias: str = 'a',\tright_df_alias: str = 'b',\tjoin_type: str = 'inner',\tbroadcast_join: bool = True,\tselect_cols: Optional[List[str]] = None,\twatermarker: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.null_handlers", "modulename": "lakehouse_engine.transformers.null_handlers", "kind": "module", "doc": "

    Module with null handlers transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers", "kind": "class", "doc": "

    Class containing null handler transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers.replace_nulls", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers.replace_nulls", "kind": "function", "doc": "

    Replace nulls in a dataframe.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\treplace_on_nums: bool = True,\tdefault_num_value: int = -999,\treplace_on_strings: bool = True,\tdefault_string_value: str = 'UNKNOWN',\tsubset_cols: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "kind": "module", "doc": "

    Optimizers module.

    \n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers", "kind": "class", "doc": "

    Class containing all the functions that can provide optimizations.

    \n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.cache", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.cache", "kind": "function", "doc": "

    Caches the current dataframe.

    \n\n

    The default storage level used is MEMORY_AND_DISK.

    \n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.persist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.persist", "kind": "function", "doc": "

    Caches the current dataframe with a specific StorageLevel.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls, storage_level: str = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.unpersist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.unpersist", "kind": "function", "doc": "

    Removes the dataframe from the disk and memory.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls, blocking: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.regex_transformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "kind": "module", "doc": "

    Regex transformers module.

    \n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers", "kind": "class", "doc": "

    Class containing all regex functions.

    \n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers.with_regex_value", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers.with_regex_value", "kind": "function", "doc": "

    Get the result of applying a regex to an input column (via regexp_extract).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed in the .transform() spark function.

    \n
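    Example (an illustrative sketch; it assumes a local SparkSession; the regex and column names are made up):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.regex_transformers import RegexTransformers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("order-12345",)], ["order_ref"])

    # extract capture group 1 (the digits) into a new column via regexp_extract
    df_extracted = df.transform(
        RegexTransformers.with_regex_value(
            input_col="order_ref",
            output_col="order_number",
            regex=r"order-(\d+)",
            idx=1,
        )
    )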
    \n", "signature": "(\tinput_col: str,\toutput_col: str,\tregex: str,\tdrop_input_col: bool = False,\tidx: int = 1) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "kind": "module", "doc": "

    Module with repartitioners transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners", "kind": "class", "doc": "

    Class containing repartitioners transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.coalesce", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.coalesce", "kind": "function", "doc": "

    Coalesce a dataframe into n partitions.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(cls, num_partitions: int) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.repartition", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.repartition", "kind": "function", "doc": "

    Repartition a dataframe into n partitions.

    \n\n

    If num_partitions is provided, repartitioning happens based on the provided number; otherwise, it happens based on the values of the provided cols (columns).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
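    Example (an illustrative sketch; it assumes a local SparkSession):

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.repartitioners import Repartitioners

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "2024-06-17")], ["id", "load_date"])

    # either repartition to a fixed number of partitions...
    df_by_number = df.transform(Repartitioners.repartition(num_partitions=200))

    # ...or repartition by the values of the given columns
    df_by_cols = df.transform(Repartitioners.repartition(cols=["load_date"]))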
    \n", "signature": "(\tcls,\tnum_partitions: Optional[int] = None,\tcols: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.transformer_factory", "modulename": "lakehouse_engine.transformers.transformer_factory", "kind": "module", "doc": "

    Module with the factory pattern to return transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory", "kind": "class", "doc": "

    TransformerFactory class following the factory pattern.

    \n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory.get_transformer", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory.get_transformer", "kind": "function", "doc": "

    Get a transformer following the factory pattern.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Transformer function to be executed in .transform() spark function.

    \n
    \n", "signature": "(\tspec: lakehouse_engine.core.definitions.TransformerSpec,\tdata: OrderedDict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions", "modulename": "lakehouse_engine.transformers.unions", "kind": "module", "doc": "

    Module with union transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions", "kind": "class", "doc": "

    Class containing union transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union", "kind": "function", "doc": "

    Union dataframes, resolving columns by position (not by name).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union_by_name", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union_by_name", "kind": "function", "doc": "

    Union dataframes, resolving columns by name (not by position).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True,\tallow_missing_columns: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "kind": "module", "doc": "

    Watermarker module.

    \n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker", "kind": "class", "doc": "

    Class containing all watermarker transformers.

    \n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker.with_watermark", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker.with_watermark", "kind": "function", "doc": "

    Get the dataframe with watermarker defined.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be executed on other transformers.

    \n
    \n", "signature": "(watermarker_column: str, watermarker_time: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils", "modulename": "lakehouse_engine.utils", "kind": "module", "doc": "

    Utilities package.

    \n"}, {"fullname": "lakehouse_engine.utils.configs", "modulename": "lakehouse_engine.utils.configs", "kind": "module", "doc": "

    Config utilities package.

    \n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils", "modulename": "lakehouse_engine.utils.configs.config_utils", "kind": "module", "doc": "

    Module to read configurations.

    \n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils", "kind": "class", "doc": "

    Config utilities class.

    \n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_acon", "kind": "function", "doc": "

    Get acon based on a filesystem path or on a dict.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Dict representation of an acon.

    \n
    \n", "signature": "(\tcls,\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None,\tdisable_dbfs_retry: bool = False) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_config", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_config", "kind": "function", "doc": "

    Get the lakehouse engine configuration file.

    \n\n
    Returns:
    \n\n
    \n

    Configuration dictionary.

    \n
    \n", "signature": "(package: str = 'lakehouse_engine.configs') -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_engine_version", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_engine_version", "kind": "function", "doc": "

    Get Lakehouse Engine version from the installed packages.

    \n\n
    Returns:
    \n\n
    \n

    String of engine version.

    \n
    \n", "signature": "(cls) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_json_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_json_acon", "kind": "function", "doc": "

    Read an acon (algorithm configuration) file.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The acon file content as a dict.

    \n
    \n", "signature": "(path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_sql", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_sql", "kind": "function", "doc": "

    Read a DDL file in Spark SQL format from a cloud object storage system.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Content of the SQL file.

    \n
    \n", "signature": "(path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.remove_sensitive_info", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.remove_sensitive_info", "kind": "function", "doc": "

    Remove sensitive info from a dictionary.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    dict without sensitive information.

    \n
    \n", "signature": "(cls, dict_to_replace: Union[dict, list]) -> Union[dict, list]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "kind": "module", "doc": "

    Utilities for databricks operations.

    \n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils", "kind": "class", "doc": "

    Databricks utilities class.

    \n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_db_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_db_utils", "kind": "function", "doc": "

    Get db utils on databricks.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Dbutils from databricks.

    \n
    \n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_databricks_job_information", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_databricks_job_information", "kind": "function", "doc": "

    Get notebook context from running acon.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Dict containing databricks notebook context.

    \n
    \n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Tuple[str, str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats", "modulename": "lakehouse_engine.utils.engine_usage_stats", "kind": "module", "doc": "

    Utilities for recording the engine activity.

    \n"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats.EngineUsageStats", "modulename": "lakehouse_engine.utils.engine_usage_stats", "qualname": "EngineUsageStats", "kind": "class", "doc": "

    Engine Usage utilities class.

    \n"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats.EngineUsageStats.store_engine_usage", "modulename": "lakehouse_engine.utils.engine_usage_stats", "qualname": "EngineUsageStats.store_engine_usage", "kind": "function", "doc": "

    Collect and store Lakehouse Engine usage statistics.

    \n\n

    These statistics include the acon and other relevant information, such as\nthe lakehouse engine version and the functions/algorithms being used.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tacon: dict,\tfunc_name: str,\tcollect_engine_usage: str = None,\tspark_confs: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.engine_usage_stats.EngineUsageStats.get_spark_conf_values", "modulename": "lakehouse_engine.utils.engine_usage_stats", "qualname": "EngineUsageStats.get_spark_conf_values", "kind": "function", "doc": "

    Get information from spark session configurations.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, usage_stats: dict, spark_confs: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.expectations_utils", "modulename": "lakehouse_engine.utils.expectations_utils", "kind": "module", "doc": "

    Utilities to be used by custom expectations.

    \n"}, {"fullname": "lakehouse_engine.utils.expectations_utils.validate_result", "modulename": "lakehouse_engine.utils.expectations_utils", "qualname": "validate_result", "kind": "function", "doc": "

    Validates the test results of the custom expectations.

    \n\n

    If you need to make additional validations on your custom expectation and/or require additional fields to be returned, you can add them before calling this function. The optional partial_success and partial_result parameters can be used to pass the result of additional validations and to add more information to the result key of the returned dict, respectively.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The result of the validation.

    \n
    \n", "signature": "(\texpectation: great_expectations.expectations.expectation.Expectation,\tconfiguration: great_expectations.core.expectation_configuration.ExpectationConfiguration,\tmetrics: Dict,\tpartial_success: bool = True,\tpartial_result: dict = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction", "modulename": "lakehouse_engine.utils.extraction", "kind": "module", "doc": "

    Extraction utilities package.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "kind": "module", "doc": "

    Utilities module for JDBC extraction processes.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType", "kind": "class", "doc": "

    Standardize the types of extractions we can have from a JDBC source.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.INIT", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.INIT", "kind": "variable", "doc": "

    \n", "default_value": "<JDBCExtractionType.INIT: 'init'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.DELTA", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.DELTA", "kind": "variable", "doc": "

    \n", "default_value": "<JDBCExtractionType.DELTA: 'delta'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction", "kind": "class", "doc": "

    Configurations available for an Extraction from a JDBC source.

    \n\n

    These configurations cover:

    \n\n\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20240617173044',\tmax_timestamp_custom_schema: Optional[str] = None)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils", "kind": "class", "doc": "

    Utils for managing data extraction from particularly relevant JDBC sources.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.__init__", "kind": "function", "doc": "

    Construct JDBCExtractionUtils.

    \n\n
    Arguments:
    \n\n\n", "signature": "(jdbc_extraction: Any)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_additional_spark_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_additional_spark_options", "kind": "function", "doc": "

    Helper to get additional Spark Options initially passed.

    \n\n

    If people provide additional Spark options not covered by the util function arguments (get_spark_jdbc_options), we need to consider them. Thus, we update the options retrieved by the utils by checking whether any Spark option initially provided is not yet considered in the retrieved options or function arguments and whether its value is not None. If these conditions are met, we add the options and return the complete dict.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).

    \n
    \n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\toptions: dict,\tignore_options: List = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_predicates", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_predicates", "kind": "function", "doc": "

    Get the predicates list, based on a predicates query.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    List containing the predicates to use to split the extraction from\n JDBC sources.

    \n
    \n", "signature": "(self, predicates_query: str) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_options", "kind": "function", "doc": "

    Get the Spark options to extract data from a JDBC source.

    \n\n
    Returns:
    \n\n
    \n

    The Spark jdbc args dictionary, including the query to submit\n and also options args dictionary.

    \n
    \n", "signature": "(self) -> Tuple[dict, dict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "kind": "function", "doc": "

    Get an optimal upperBound to properly split a Spark JDBC extraction.

    \n\n
    Returns:
    \n\n
    \n

    Either an int, date or timestamp to serve as upperBound Spark JDBC option.

    \n
    \n", "signature": "(self) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "kind": "module", "doc": "

    Utilities module for SAP B4 extraction processes.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes", "kind": "class", "doc": "

    Standardize the types of ADSOs we can have for Extractions from SAP B4.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.AQ", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.AQ", "kind": "variable", "doc": "

    \n", "annotation": ": str", "default_value": "<ADSOTypes.AQ: 'AQ'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.CL", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.CL", "kind": "variable", "doc": "

    \n", "annotation": ": str", "default_value": "<ADSOTypes.CL: 'CL'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.SUPPORTED_TYPES", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.SUPPORTED_TYPES", "kind": "variable", "doc": "

    \n", "annotation": ": list", "default_value": "<ADSOTypes.SUPPORTED_TYPES: ['AQ', 'CL']>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction", "kind": "class", "doc": "

    Configurations available for an Extraction from SAP B4.

    \n\n

    It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

    \n\n

    These configurations cover:

    \n\n\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: str = 'REQTSN DECIMAL(23,0)',\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20240617173044',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)',\tlatest_timestamp_input_col: str = 'REQTSN',\trequest_status_tbl: str = 'SAPHANADB.RSPMREQUEST',\trequest_col_name: str = 'REQUEST_TSN',\tdata_target: Optional[str] = None,\tact_req_join_condition: Optional[str] = None,\tinclude_changelog_tech_cols: Optional[bool] = None,\textra_cols_req_status_tbl: Optional[str] = None,\trequest_status_tbl_filter: Optional[str] = None,\tadso_type: Optional[str] = None,\tdefault_max_timestamp: str = '1970000000000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils", "kind": "class", "doc": "

    Utils for managing data extraction from SAP B4.

    \n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.__init__", "kind": "function", "doc": "

    Construct SAPB4ExtractionUtils.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tsap_b4_extraction: lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.get_data_target", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.get_data_target", "kind": "function", "doc": "

    Get the data_target from the data_target option or derive it.

    \n\n

    By definition, the data_target is the same for the table and the changelog table: it is the dbtable string, ignoring everything before the / and dropping the first and last characters after it. E.g., for a dbtable /BIC/abtable12, the data_target would be btable1.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A string with the data_target.

    \n
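    A plain-Python sketch of the derivation rule described above (illustrative only; this is not the library's internal implementation, which reads from the input spec options dict):

    def derive_data_target(dbtable: str) -> str:
        # take the part after the last "/", then drop its first and last characters
        name = dbtable.rsplit("/", 1)[-1]
        return name[1:-1]

    assert derive_data_target("/BIC/abtable12") == "btable1"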
    \n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "kind": "module", "doc": "

    Utilities module for SAP BW extraction processes.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction", "kind": "class", "doc": "

    Configurations available for an Extraction from SAP BW.

    \n\n

    It inherits from JDBCExtraction configurations, so it can use and/or overwrite those configurations.

    \n\n

    These configurations cover:

    \n\n\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20240617173044',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)',\tlatest_timestamp_input_col: str = 'actrequest_timestamp',\tact_request_table: str = 'SAPPHA.RSODSACTREQ',\trequest_col_name: str = 'actrequest',\tact_req_join_condition: Optional[str] = None,\todsobject: Optional[str] = None,\tinclude_changelog_tech_cols: bool = True,\textra_cols_act_request: Optional[str] = None,\tget_timestamp_from_act_request: bool = False,\tsap_bw_schema: str = 'SAPPHA',\tdefault_max_timestamp: str = '197000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils", "kind": "class", "doc": "

    Utils for managing data extraction from particularly relevant JDBC sources.

    \n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.__init__", "kind": "function", "doc": "

    Construct SAPBWExtractionUtils.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tsap_bw_extraction: lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_changelog_table", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_changelog_table", "kind": "function", "doc": "

    Get the changelog table, given an odsobject.

    \n\n
    Returns:
    \n\n
    \n

    String to use as changelog_table.

    \n
    \n", "signature": "(self) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_odsobject", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_odsobject", "kind": "function", "doc": "

    Get the odsobject based on the provided options.

    \n\n

    With the table name we may also get the db name, so we need to split it. Moreover, people might need to specify the odsobject explicitly if it is different from the dbtable.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A string with the odsobject.

    \n
    \n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "kind": "module", "doc": "

    Utilities module for SFTP extraction processes.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat", "kind": "class", "doc": "

    Formats of algorithm input.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.CSV", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.CSV", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPInputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.FWF", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.FWF", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPInputFormat.FWF: 'fwf'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.JSON", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.JSON", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPInputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.XML", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.XML", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPInputFormat.XML: 'xml'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter", "kind": "class", "doc": "

    Standardize the types of filters we can have from a SFTP source.

    \n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.file_name_contains", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.file_name_contains", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPExtractionFilter.file_name_contains: 'file_name_contains'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LATEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LATEST_FILE", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPExtractionFilter.LATEST_FILE: 'latest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.EARLIEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.EARLIEST_FILE", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPExtractionFilter.EARLIEST_FILE: 'earliest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.GREATER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.GREATER_THAN", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPExtractionFilter.GREATER_THAN: 'date_time_gt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LOWER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LOWER_THAN", "kind": "variable", "doc": "

    \n", "default_value": "<SFTPExtractionFilter.LOWER_THAN: 'date_time_lt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils", "kind": "class", "doc": "

    Utils for managing data extraction from particularly relevant SFTP sources.

    \n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_files_list", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_files_list", "kind": "function", "doc": "

    Get a list of files to be extracted from SFTP.

    \n\n

    The arguments (options_args) to list files are:

    \n\n\n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A list containing the file names to be passed to Spark.

    \n
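    A minimal, hypothetical options_args sketch for filtering files, assuming the filter keys mirror the SFTPExtractionFilter values listed above (the exact accepted keys and date format should be confirmed in the reader documentation):

    options_args = {
        "file_name_contains": "orders",          # keep only files whose name contains this token
        "date_time_gt": "2024-01-01 00:00:00",   # keep only files modified after this timestamp
    }
    # this dict would then be passed as options_args to SFTPExtractionUtils.get_files_list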
    \n", "signature": "(\tcls,\tsftp: paramiko.sftp_client.SFTPClient,\tremote_path: str,\toptions_args: dict) -> Set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_sftp_client", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_sftp_client", "kind": "function", "doc": "

    Get the SFTP client.

    \n\n

    The SFTP client is used to open an SFTP session across an open\nSSH Transport and perform remote file operations.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.

    \n
    \n", "signature": "(\tcls,\toptions_args: dict) -> Tuple[paramiko.sftp_client.SFTPClient, paramiko.transport.Transport]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_format", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_format", "kind": "function", "doc": "

    Validate the file extension based on the format definitions.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The string validated and formatted.

    \n
    \n", "signature": "(cls, files_format: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_location", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_location", "kind": "function", "doc": "

    Validate the location. Add \"/\" in the case it does not exist.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The location validated.

    \n
    \n", "signature": "(cls, location: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils", "modulename": "lakehouse_engine.utils.file_utils", "kind": "module", "doc": "

    Utilities for file name based operations.

    \n"}, {"fullname": "lakehouse_engine.utils.file_utils.get_file_names_without_file_type", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_file_names_without_file_type", "kind": "function", "doc": "

    Function to retrieve list of file names in a folder.

    \n\n

    This function filters by file type and removes the extension of the file name\nit returns.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A list of file names without file type.

    \n
    \n", "signature": "(path: str, file_type: str, exclude_regex: str) -> list:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils.get_directory_path", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_directory_path", "kind": "function", "doc": "

    Add '/' to the end of the path of a directory.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Directory path stripped and with '/' at the end.

    \n
    \n", "signature": "(path: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils", "modulename": "lakehouse_engine.utils.gab_utils", "kind": "module", "doc": "

    Module to define GAB Utility classes.

    \n"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils", "kind": "class", "doc": "

    Class containing utility functions for GAB.

    \n"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.logger", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.logger", "kind": "function", "doc": "

    Store the execution of each stage in the log events table.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tself,\trun_start_time: datetime.datetime,\trun_end_time: datetime.datetime,\tstart: str,\tend: str,\tquery_id: str,\tquery_label: str,\tcadence: str,\tstage_file_path: str,\tquery: str,\tstatus: str,\terror_message: Union[Exception, str],\ttarget_database: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.get_json_column_as_dict", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.get_json_column_as_dict", "kind": "function", "doc": "

    Get JSON column as dictionary.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tlookup_query_builder: pyspark.sql.dataframe.DataFrame,\tquery_id: str,\tquery_column: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.extract_columns_from_mapping", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.extract_columns_from_mapping", "kind": "function", "doc": "

    Extract and transform columns to SQL select statement.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tcolumns: dict,\tis_dimension: bool,\textract_column_without_alias: bool = False,\ttable_alias: Optional[str] = None,\tis_extracted_value_as_name: bool = True) -> Union[tuple[list[str], list[str]], list[str]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.get_cadence_configuration_at_end_date", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.get_cadence_configuration_at_end_date", "kind": "function", "doc": "

    Get a dictionary of the cadences that conclude at the given end date.

    \n\n

    For any end date provided by the user, we check whether that end date is actually the end of a cadence (YEAR, QUARTER, MONTH, WEEK). For example, if the user inputs 2024-03-31, that date is both a month end and a quarter end, which means any use cases configured as month or quarter need to be calculated.

    \n\n
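    An illustrative-only sketch of the idea described above (not the engine's actual implementation; the real method returns a dict whose exact structure may differ):

    from datetime import datetime, timedelta

    def concluded_cadences(end_date: datetime, week_start: str = "Monday") -> list[str]:
        """List which cadences conclude at the given end date (illustration only)."""
        next_day = end_date + timedelta(days=1)
        cadences = []
        # a week ends when the next day starts a new week (Monday- or Sunday-based weeks)
        if (week_start == "Monday" and next_day.weekday() == 0) or (
            week_start == "Sunday" and next_day.weekday() == 6
        ):
            cadences.append("WEEK")
        if next_day.day == 1:
            cadences.append("MONTH")
            if next_day.month in (1, 4, 7, 10):
                cadences.append("QUARTER")
            if next_day.month == 1:
                cadences.append("YEAR")
        return cadences

    concluded_cadences(datetime(2024, 3, 31))  # -> ["WEEK", "MONTH", "QUARTER"] with Monday-based weeks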
    Arguments:
    \n\n\n", "signature": "(cls, end_date: datetime.datetime) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.get_reconciliation_cadences", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.get_reconciliation_cadences", "kind": "function", "doc": "

    Get reconciliation cadences based on the use case configuration.

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tself,\tcadence: str,\tselected_reconciliation_window: dict,\tcadence_configuration_at_end_date: dict,\trerun_flag: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABUtils.format_datetime_to_default", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABUtils.format_datetime_to_default", "kind": "function", "doc": "

    Format datetime to GAB default format.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, date_to_format: datetime.datetime) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABPartitionUtils", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABPartitionUtils", "kind": "class", "doc": "

    Class to extract a partition based in a date period.

    \n"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABPartitionUtils.get_years", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABPartitionUtils.get_years", "kind": "function", "doc": "

    Return a list of distinct years from the input parameters.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, start_date: str, end_date: str) -> list[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.gab_utils.GABPartitionUtils.get_partition_condition", "modulename": "lakehouse_engine.utils.gab_utils", "qualname": "GABPartitionUtils.get_partition_condition", "kind": "function", "doc": "

    Return year, month and day partition statement from the input parameters.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, start_date: str, end_date: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler", "modulename": "lakehouse_engine.utils.logging_handler", "kind": "module", "doc": "

    Module to configure project logging.

    \n"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData", "kind": "class", "doc": "

    Logging filter to hide sensitive data from being shown in the logs.

    \n", "bases": "logging.Filter"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData.filter", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData.filter", "kind": "function", "doc": "

    Hide sensitive information from being shown in the logs.

    \n\n

    Based on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    The transformed record to be logged.

    \n
    \n", "signature": "(self, record: logging.LogRecord) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler", "kind": "class", "doc": "

    Handle the logging of the lakehouse engine project.

    \n"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.__init__", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.__init__", "kind": "function", "doc": "

    Construct a LoggingHandler instance.

    \n\n
    Arguments:
    \n\n\n", "signature": "(class_name: str)"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.get_logger", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.get_logger", "kind": "function", "doc": "

    Get the _logger instance variable.

    \n\n
    Returns:
    \n\n
    \n

    The logger object.

    \n
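    A short usage sketch based on the signatures above (the class name is a placeholder):

    from lakehouse_engine.utils.logging_handler import LoggingHandler

    logger = LoggingHandler(class_name="MyDataProductJob").get_logger()
    logger.info("Starting my data load...")  # a standard logging.Logger from here on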
    \n", "signature": "(self) -> logging.Logger:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils", "modulename": "lakehouse_engine.utils.schema_utils", "kind": "module", "doc": "

    Utilities to facilitate dataframe schema management.

    \n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils", "kind": "class", "doc": "

    Schema utils that help retrieve and manage schemas of dataframes.

    \n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file", "kind": "function", "doc": "

    Get a spark schema from a file (spark StructType json file) in a file system.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Spark schema struct type.

    \n
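    A short usage sketch; the path below is a placeholder and must point to a Spark StructType json file:

    from lakehouse_engine.utils.schema_utils import SchemaUtils

    schema = SchemaUtils.from_file(
        "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json"
    )
    print(schema.simpleString())  # prints the schema as a struct<...> string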
    \n", "signature": "(\tfile_path: str,\tdisable_dbfs_retry: bool = False) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file_to_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file_to_dict", "kind": "function", "doc": "

    Get a dict with the spark schema from a file in a file system.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Spark schema in a dict.

    \n
    \n", "signature": "(file_path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_dict", "kind": "function", "doc": "

    Get a spark schema from a dict.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Spark schema struct type.

    \n
    \n", "signature": "(struct_type: dict) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_table_schema", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_table_schema", "kind": "function", "doc": "

    Get a spark schema from a table.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Spark schema struct type.

    \n
    \n", "signature": "(table: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "

    Get a spark schema from an input specification.

    \n\n

    This covers scenarios where the schema is provided as part of the input\nspecification of the algorithm. Schema can come from the table specified in the\ninput specification (enforce_schema_from_table) or by the dict with the spark\nschema provided there also.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Spark schema struct type.

    \n
    \n", "signature": "(\tcls,\tinput_spec: lakehouse_engine.core.definitions.InputSpec) -> Optional[pyspark.sql.types.StructType]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.schema_flattener", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.schema_flattener", "kind": "function", "doc": "

    Recursive method to flatten the schema of the dataframe.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    A function to be called in .transform() spark function.

    \n
    \n", "signature": "(\tschema: pyspark.sql.types.StructType,\tprefix: str = None,\tlevel: int = 1,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.sql_parser_utils", "modulename": "lakehouse_engine.utils.sql_parser_utils", "kind": "module", "doc": "

    Module to parse sql files.

    \n"}, {"fullname": "lakehouse_engine.utils.sql_parser_utils.SQLParserUtils", "modulename": "lakehouse_engine.utils.sql_parser_utils", "qualname": "SQLParserUtils", "kind": "class", "doc": "

    Parser utilities class.

    \n"}, {"fullname": "lakehouse_engine.utils.sql_parser_utils.SQLParserUtils.split_sql_commands", "modulename": "lakehouse_engine.utils.sql_parser_utils", "qualname": "SQLParserUtils.split_sql_commands", "kind": "function", "doc": "

    Read the sql commands of a file to choose how to split them.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    List with the sql commands.

    \n
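    A short usage sketch based on the signature above; the SQL string is just an example:

    from lakehouse_engine.utils.sql_parser_utils import SQLParserUtils

    commands = SQLParserUtils().split_sql_commands(
        sql_commands="CREATE TABLE my_db.dummy (id INT); INSERT INTO my_db.dummy VALUES (1);",
        delimiter=";",
        advanced_parser=False,
    )
    # -> one list entry per sql command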
    \n", "signature": "(\tself,\tsql_commands: str,\tdelimiter: str,\tadvanced_parser: bool) -> list[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage", "modulename": "lakehouse_engine.utils.storage", "kind": "module", "doc": "

    Utilities to interact with storage systems.

    \n"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "kind": "module", "doc": "

    Module to represent a DBFS file storage system.

    \n"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage.DBFSStorage", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "qualname": "DBFSStorage", "kind": "class", "doc": "

    Class to represent a DBFS file storage system.

    \n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage.DBFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "qualname": "DBFSStorage.get_file_payload", "kind": "function", "doc": "

    Get the content of a file.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    File payload/content.

    \n
    \n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.dbfs_storage.DBFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.dbfs_storage", "qualname": "DBFSStorage.write_payload_to_file", "kind": "function", "doc": "

    Write payload into a file.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage", "modulename": "lakehouse_engine.utils.storage.file_storage", "kind": "module", "doc": "

    Module for abstract representation of a storage system holding files.

    \n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage", "kind": "class", "doc": "

    Abstract file storage class.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.get_file_payload", "kind": "function", "doc": "

    Get the payload of a file.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    File payload/content.

    \n
    \n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.write_payload_to_file", "kind": "function", "doc": "

    Write payload into a file.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "kind": "module", "doc": "

    Module for common file storage functions.

    \n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions", "kind": "class", "doc": "

    Class for common file storage functions.

    \n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_json", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_json", "kind": "function", "doc": "

    Read a json file.

    \n\n

    The file should be in a supported file system (e.g., s3, dbfs or\nlocal filesystem).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Dict with json file content.

    \n
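    A short usage sketch; the path below is a placeholder for a json file on s3, dbfs or the local filesystem:

    from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions

    acon = FileStorageFunctions.read_json("s3://my-data-product-bucket/artefacts/acons/my_acon.json")
    # acon is now a regular python dict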
    \n", "signature": "(cls, path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_sql", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_sql", "kind": "function", "doc": "

    Read a sql file.

    \n\n

    The file should be in a supported file system (e.g., s3, dbfs or local\nfilesystem).

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    Content of the SQL file.

    \n
    \n", "signature": "(cls, path: str, disable_dbfs_retry: bool = False) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.write_payload", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.write_payload", "kind": "function", "doc": "

    Write payload into a file.

    \n\n

    The file should be in a supported file system (e.g., s3, dbfs or local\nfilesystem).

    \n\n
    Arguments:
    \n\n\n", "signature": "(\tcls,\tpath: str,\turl: urllib.parse.ParseResult,\tcontent: str,\tdisable_dbfs_retry: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.is_boto3_configured", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.is_boto3_configured", "kind": "function", "doc": "

    Check if boto3 is able to locate credentials and properly configured.

    \n\n

    If boto3 is not properly configured, we might want to try a different reader.

    \n", "signature": "() -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "kind": "module", "doc": "

    Module to represent a local file storage system.

    \n"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage", "kind": "class", "doc": "

    Class to represent a local file storage system.

    \n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.get_file_payload", "kind": "function", "doc": "

    Get the payload of a file.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    file payload/content.

    \n
    \n", "signature": "(cls, url: urllib.parse.ParseResult) -> <class 'TextIO'>:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.write_payload_to_file", "kind": "function", "doc": "

    Write payload into a file.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "kind": "module", "doc": "

    Module to represent a s3 file storage system.

    \n"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage", "kind": "class", "doc": "

    Class to represent a s3 file storage system.

    \n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.get_file_payload", "kind": "function", "doc": "

    Get the payload of a config file.

    \n\n
    Arguments:
    \n\n\n\n
    Returns:
    \n\n
    \n

    File payload/content.

    \n
    \n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.write_payload_to_file", "kind": "function", "doc": "

    Write payload into a file.

    \n\n
    Arguments:
    \n\n\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine_usage", "modulename": "lakehouse_engine_usage", "kind": "module", "doc": "

    How to use the Lakehouse Engine?

    \n\n

    Lakehouse engine usage examples for all the algorithms and other core functionalities.

    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader", "modulename": "lakehouse_engine_usage.data_loader", "kind": "module", "doc": "

    Data Loader

    \n\n

    How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file?

    \n\n

    An algorithm (e.g., data load) in the lakehouse-engine is configured using an ACON. The lakehouse-engine is a configuration-driven framework, so people don't have to write code to execute a Spark algorithm. Instead, the algorithm is written in pyspark and accepts configurations through a JSON file (an ACON - algorithm configuration). The ACON is the configuration providing the behaviour of a lakehouse engine algorithm. You can check the algorithm code, and how it interprets the ACON, here. In this page we will go through the structure of an ACON file and which ACON files are most suitable for common data engineering scenarios. Check the pages underneath to find several ACON examples that cover many data extraction, transformation and loading scenarios.

    \n\n

    Overview of the Structure of the ACON file for DataLoads

    \n\n

    An ACON-based algorithm needs several specifications to work properly, but some of them might be optional. The available\nspecifications are:

    \n\n\n\n

    Below is an example of a complete ACON file that reads from an s3 folder with CSVs and incrementally loads that data (using a merge) into a delta lake table.

    \n\n
    \n\n

    spec_id is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in transform_specs) of a DataFrame that was read in the input_specs. Check ACON below to see how the spec_id of the input_specs is used as input_id in one transform specification.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "orders_bronze",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json",\n      "with_filepath": True,\n      "options": {\n        "badRecordsPath": "s3://my-data-product-bucket/badrecords/order_events_with_dq/",\n        "header": False,\n        "delimiter": "\\u005E",\n        "dateFormat": "yyyyMMdd"\n      },\n      "location": "s3://my-data-product-bucket/bronze/orders/"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "orders_bronze_with_extraction_date",\n      "input_id": "orders_bronze",\n      "transformers": [\n        {\n          "function": "with_row_id"\n        },\n        {\n          "function": "with_regex_value",\n          "args": {\n            "input_col": "lhe_extraction_filepath",\n            "output_col": "extraction_date",\n            "drop_input_col": True,\n            "regex": ".*WE_SO_SCL_(\\\\d+).csv"\n          }\n        }\n      ]\n    }\n  ],\n  "dq_specs": [\n    {\n      "spec_id": "check_orders_bronze_with_extraction_date",\n      "input_id": "orders_bronze_with_extraction_date",\n      "dq_type": "validator",\n      "result_sink_db_table": "my_database.my_table_dq_checks",\n      "fail_on_error": False,\n      "dq_functions": [\n        {\n          "dq_function": "expect_column_values_to_not_be_null",\n          "args": {\n            "column": "omnihub_locale_code"\n          }\n        },\n        {\n          "dq_function": "expect_column_unique_value_count_to_be_between",\n          "args": {\n            "column": "product_division",\n            "min_value": 10,\n            "max_value": 100\n          }\n        },\n        {\n          "dq_function": "expect_column_max_to_be_between",\n          "args": {\n            "column": "so_net_value",\n            "min_value": 10,\n            "max_value": 1000\n          }\n        },\n        {\n          "dq_function": "expect_column_value_lengths_to_be_between",\n          "args": {\n            "column": "omnihub_locale_code",\n            "min_value": 1,\n            "max_value": 10\n          }\n        },\n        {\n          "dq_function": "expect_column_mean_to_be_between",\n          "args": {\n            "column": "coupon_code",\n            "min_value": 15,\n            "max_value": 20\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "orders_silver",\n      "input_id": "check_orders_bronze_with_extraction_date",\n      "data_format": "delta",\n      "write_type": "merge",\n      "partitions": [\n        "order_date_header"\n      ],\n      "merge_opts": {\n        "merge_predicate": """\n            new.sales_order_header = current.sales_order_header\n            and new.sales_order_schedule = current.sales_order_schedule\n            and new.sales_order_item=current.sales_order_item\n            and new.epoch_status=current.epoch_status\n            and new.changed_on=current.changed_on\n            and new.extraction_date=current.extraction_date\n            and new.lhe_batch_id=current.lhe_batch_id\n            and new.lhe_row_id=current.lhe_row_id\n        """,\n        "insert_only": True\n      },\n      "db_table": "my_database.my_table_with_dq",\n      "location": "s3://my-data-product-bucket/silver/order_events_with_dq/",\n      "with_batch_id": True,\n      "options": {\n        "checkpointLocation": 
"s3://my-data-product-bucket/checkpoints/order_events_with_dq/"\n      }\n    }\n  ],\n  "terminate_specs": [\n    {\n      "function": "optimize_dataset",\n      "args": {\n        "db_table": "my_database.my_table_with_dq"\n      }\n    }\n  ],\n  "exec_env": {\n    "spark.databricks.delta.schema.autoMerge.enabled": True\n  }\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    Input Specifications

    \n\n

    You specify how to read the data by providing a list of Input Specifications. Usually there's just one element in that list, as, in the lakehouse, you are generally focused on reading data from one layer (e.g., source, bronze, silver, gold) and putting it into the next layer. However, there may be scenarios where you would like to combine two datasets (e.g., joins or incremental filtering on one dataset based on the values of another one), and in that case you can use more than one element, as sketched right below. More information about InputSpecs.

    \n\n
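    A minimal sketch of a two-element input_specs list (all names below are placeholders; the JDBC append load example further below uses this pattern end to end):

    input_specs = [
        {
            "spec_id": "sales_source",
            "read_type": "batch",
            "data_format": "csv",
            "options": {"header": True, "delimiter": "|"},
            "location": "s3://my-data-product-bucket/bronze/sales/",
        },
        {
            # a second input, e.g. the target table, used to derive an incremental filter value
            "spec_id": "sales_bronze",
            "read_type": "batch",
            "db_table": "my_database.my_sales_table",
        },
    ]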
    Relevant notes
    \n\n\n\n

    Transform Specifications

    \n\n

    In the lakehouse engine, you transform data by providing a transform specification, which contains a list of transform functions (transformers). So the transform specification acts upon one input, and it can execute multiple lakehouse engine transformation functions (transformers) upon that input.

    \n\n

    If you look into the example above, we ask the lakehouse engine to execute two functions on the orders_bronze input data: with_row_id and with_regex_value. Those functions can of course receive arguments. You can see a list of all available transformation functions (transformers) here: lakehouse_engine.transformers. Then, you just invoke them in your ACON as demonstrated above, following exactly the same function names and parameter names as described in the code documentation. More information about TransformSpec.

    \n\n
    Relevant notes
    \n\n\n\n

    Data Quality Specifications

    \n\n

    One of the most relevant features of the lakehouse engine is that you can have data quality guardrails that prevent you\nfrom loading bad data into your target layer (e.g., bronze, silver or gold). The lakehouse engine data quality process\nincludes one main feature at the moment:

    \n\n\n\n

    The output of the data quality process can be written into a Result Sink target (e.g. table or files) and is integrated with a Data Docs website, which can be a company-wide available website for people to check the quality of their data and share with others.

    \n\n

    To achieve all of this functionality the lakehouse engine uses Great Expectations internally. To hide the Great Expectations internals from our user base and provide friendlier abstractions using the ACON, we have developed the concept of DQSpec, which can contain many DQFunctionSpec objects. This is very similar to the relationship between TransformSpec and TransformerSpec, meaning you can have multiple Great Expectations functions executed inside a single data quality specification (as in the ACON above).

    \n\n
    \n\n

    The names of the functions and args are a one-to-one match with the Great Expectations API.

    \n\n
    \n\n

    More information about DQSpec.

    \n\n
    Relevant notes
    \n\n\n\n

    Output Specifications

    \n\n

    The output_specs section of an ACON is relatively similar to the input_specs section, but of course focusing on how to write the results of the algorithm, instead of specifying the input for the algorithm, hence the name output_specs (output specifications). More information about OutputSpec.

    \n\n
    Relevant notes
    \n\n\n\n

    Terminate Specifications

    \n\n

    The terminate_specs section of the ACON is responsible for some \"wrapping up\" activities, like optimising a table or vacuuming old files in a delta table. This stage is fully optional, so you can omit it from the ACON. Over time the list of available terminators will likely increase (e.g., reconciliation processes); the most relevant ones in the context of the lakehouse initiative are the following:

    \n\n\n\n

    More information about TerminatorSpec.

    \n\n
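    A minimal terminate_specs sketch, mirroring the optimize_dataset terminator used in the full ACON above (the table name is a placeholder):

    terminate_specs = [
        {
            "function": "optimize_dataset",
            "args": {"db_table": "my_database.my_table_with_dq"},
        }
    ]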

    Execution Environment

    \n\n

    In the exec_env section of the ACON you can pass any Spark Session configuration that you want to define for the\nexecution of your algorithm. This is basically just a JSON structure that takes in any Spark Session property, so no\ncustom lakehouse engine logic. This stage is fully optional, you can omit it from the ACON.

    \n\n
    \n\n

    Please be aware that Spark Session configurations that are not allowed to be changed when the Spark cluster is already\nrunning need to be passed in the configuration of the job/cluster that runs this algorithm, not here in this section.\nThis section only accepts Spark Session configs that can be changed in runtime. Whenever you introduce an option make\nsure that it takes effect during runtime, as to the best of our knowledge there's no list of allowed Spark properties\nto be changed after the cluster is already running. Moreover, typically Spark algorithms fail if you try to modify a\nconfig that can only be set up before the cluster is running.

    \n\n
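    A minimal exec_env sketch; both properties below are examples of Spark Session configs that can be changed at runtime (anything that cannot must go into the job/cluster configuration instead, as noted above):

    exec_env = {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.sql.shuffle.partitions": 200,
    }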
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.append_load_from_jdbc_with_permissive_mode", "modulename": "lakehouse_engine_usage.data_loader.append_load_from_jdbc_with_permissive_mode", "kind": "module", "doc": "

    Append Load from JDBC with PERMISSIVE mode (default)

    \n\n

    This scenario is an append load from a JDBC source (e.g., SAP BW, Oracle Database, SQL Server Database...).

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "jdbc",\n      "jdbc_args": {\n        "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/append_load/jdbc_permissive/tests.db",\n        "table": "jdbc_permissive",\n        "properties": {\n          "driver": "org.sqlite.JDBC"\n        }\n      },\n      "options": {\n        "numPartitions": 1\n      }\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "db_table": "test_db.jdbc_permissive_table"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_date",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "date"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "appended_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "date",\n            "increment_df": "max_sales_bronze_date"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "appended_sales",\n      "write_type": "append",\n      "db_table": "test_db.jdbc_permissive_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/jdbc_permissive/data"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Relevant notes
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.append_load_with_failfast", "modulename": "lakehouse_engine_usage.data_loader.append_load_with_failfast", "kind": "module", "doc": "

    Append Load with FAILFAST

    \n\n

    This scenario is an append load that enforces the schema (using the schema of the target table to enforce the schema of the source, i.e., the schema of the source needs to exactly match the schema of the target table) and fails fast (FAILFAST mode) if the schema of the input data does not match the one we specified.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "enforce_schema_from_table": "test_db.failfast_table",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "mode": "FAILFAST"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/append_load/failfast/data"\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "db_table": "test_db.failfast_table"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_date",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "date"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "appended_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "date",\n            "increment_df": "max_sales_bronze_date"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "appended_sales",\n      "write_type": "append",\n      "db_table": "test_db.failfast_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/failfast/data"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Relevant notes
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.batch_delta_load_init_delta_backfill_with_merge", "modulename": "lakehouse_engine_usage.data_loader.batch_delta_load_init_delta_backfill_with_merge", "kind": "module", "doc": "

    Batch Delta Load Init, Delta and Backfill with Merge

    \n\n

    This scenario illustrates the process of implementing a delta load algorithm by first using an ACON to perform an initial load, then another one to perform the regular deltas that will be triggered on a recurrent basis, and finally an ACON for backfilling specific parcels if ever needed.

    \n\n

    Init Load

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "inferSchema": True\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "condensed_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "condensed_sales",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date"\n      }\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Relevant Notes
    \n\n\n\n

    Delta Load

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "inferSchema": True\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_timestamp",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "actrequest_timestamp"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "condensed_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "actrequest_timestamp",\n            "increment_df": "max_sales_bronze_timestamp"\n          }\n        },\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "condensed_sales",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n        "delete_predicate": "new.recordmode in ('R','D','X')",\n        "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"\n      }\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Relevant Notes
    \n\n
    \n\n

    \n
  • The order of the predicates in the ACON does not matter; it is the logic in the lakehouse engine DeltaMergeWriter's \"_merge\" function that matters.
  • \n
  • Notice the \"<=>\" operator? In Spark SQL that's the null safe equal.
  • \n\n\n
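    A quick illustration of the difference, assuming a SparkSession named spark is available:

    # Regular equality yields NULL when either side is NULL, so such rows would never
    # match a merge predicate; the null-safe equal `<=>` yields true when both sides are NULL.
    spark.sql("SELECT NULL = NULL AS equals, NULL <=> NULL AS null_safe_equals").show()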

    Backfilling

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": True,\n        "delimiter": "|",\n        "inferSchema": True\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"\n    },\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "batch",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "max_sales_bronze_timestamp",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "get_max_value",\n          "args": {\n            "input_col": "actrequest_timestamp"\n          }\n        }\n      ]\n    },\n    {\n      "spec_id": "condensed_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "incremental_filter",\n          "args": {\n            "input_col": "actrequest_timestamp",\n            "increment_value": "20180110120052t",\n            "greater_or_equal": True\n          }\n        },\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "condensed_sales",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n        "delete_predicate": "new.recordmode in ('R','D','X')",\n        "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"\n      }\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Relevant Notes
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.custom_transformer", "modulename": "lakehouse_engine_usage.data_loader.custom_transformer", "kind": "module", "doc": "

    Custom Transformer

    \n\n

    There may be scenarios where the data product dev team faces the need to perform complex data transformations that are either not yet available in the lakehouse engine or whose logic is just too complex to chain in an ACON file. In the context of the lakehouse, the only layers that usually impose that complexity are silver+ and gold. This page targets exactly those cases.

    \n\n

    Below you'll find a notebook where you can pass your own PySpark or Spark SQL logic into the ACON, by dynamically injecting a python function into the ACON dictionary. The lakehouse engine will take care of executing those transformations in the transformation step of the data loader algorithm. Please read the notebook's comments carefully to understand how it works, or simply open it in your notebook environment, which will make the notebook's code and comments more readable.

    \n\n
    \n\n
    Force Streaming Micro Batch Processing.
    \n\n

    When you use streaming mode with a custom transformer, it's highly advisable that you set the force_streaming_foreach_batch_processing flag to True in the transform specification, as explained above!

    \n\n
    \n\n

    What is a custom transformer in the Lakehouse Engine and how you can use it to write your own pyspark logic?

    \n\n

    We highly promote the Lakehouse Engine for creating Data Products aligned with the data source (bronze/silver layer), pumping data into silver so our Data Scientists and Analysts can leverage the value of the data in silver, as close as possible to how it comes from the source. The low-code and configuration-driven nature of the lakehouse engine makes it a compelling framework to use in such cases, where the transformations that are done from bronze to silver are not that many, as we want to keep the data close to the source.

    \n\n

    However, when it comes to Data Products enriched in some way or built for insights (silver+, gold), they are typically heavy on transformations (they are the T of the overall ELT process), so the nature of the lakehouse engine may get in the way of adequately building them. Considering this, and considering our user base that prefers an ACON-based approach and all the nice off-the-shelf features of the lakehouse engine, we have developed a feature that allows us to pass custom transformers, where you put your entire pyspark logic and pass it as an argument in the ACON (the configuration file that configures every lakehouse engine algorithm).

    \n\n

    Motivation:

    \n\n

    Doing that, you let the ACON guide your read, data quality, write and terminate processes, and you just focus on transforming data :)

    \n\n

    Custom transformation Function

    \n\n

    The function below is the one that encapsulates all your defined pyspark logic and sends it as a python function to the lakehouse engine. This function will then be invoked internally in the lakehouse engine via a df.transform() function. If you are interested in checking the internals of the lakehouse engine, our codebase is openly available here: https://github.com/adidas/lakehouse-engine

    \n\n
    \n\n
    Attention!!!
    \n\n

    For this process to work, the function you define below needs to receive a DataFrame and return a DataFrame. Any other method signature (e.g., defining more parameters) will not work, unless you use something like Python partials (see the sketch right after the code block below).

    \n\n
    \n\n
    \n
    # imports needed so this transformer is self-contained ("sum" below is pyspark's sum, shadowing the built-in)\nfrom pyspark.sql import DataFrame\nfrom pyspark.sql.functions import col, lit, sum, when\n\nfrom lakehouse_engine.core.exec_env import ExecEnv\n\n\ndef get_new_data(df: DataFrame) -> DataFrame:\n    """Get the new data from the lakehouse engine reader and prepare it."""\n    return (\n        df.withColumn("amount", when(col("_change_type") == "delete", lit(0)).otherwise(col("amount")))\n        .select("article_id", "order_date", "amount")\n        .groupBy("article_id", "order_date")\n        .agg(sum("amount").alias("amount"))\n    )\n\n\ndef get_joined_data(new_data_df: DataFrame, current_data_df: DataFrame) -> DataFrame:\n    """Join the new data with the current data already existing in the target dataset."""\n    return (\n        new_data_df.alias("new_data")\n        .join(\n            current_data_df.alias("current_data"),\n            [\n                new_data_df.article_id == current_data_df.article_id,\n                new_data_df.order_date == current_data_df.order_date,\n            ],\n            "left_outer",\n        )\n        .withColumn(\n            "current_amount", when(col("current_data.amount").isNull(), lit(0)).otherwise(col("current_data.amount"))\n        )\n        .withColumn("final_amount", col("current_amount") + col("new_data.amount"))\n        .select(col("new_data.article_id"), col("new_data.order_date"), col("final_amount").alias("amount"))\n    )\n\n\ndef calculate_kpi(df: DataFrame) -> DataFrame:\n    """Calculate KPI through a custom transformer that will be provided in the ACON.\n\n    Args:\n        df: DataFrame passed as input.\n\n    Returns:\n        DataFrame: the transformed DataFrame.\n    """\n    new_data_df = get_new_data(df)\n\n    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because it is the internal object the\n    # lakehouse engine uses to refer to the spark session. But using 'spark' should also be fine.\n    current_data_df = ExecEnv.SESSION.table(\n        "my_database.my_table"\n    )\n\n    transformed_df = get_joined_data(new_data_df, current_data_df)\n\n    return transformed_df\n
    \n
    \n\n
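    If you do need extra parameters in your custom transformer, a functools.partial can adapt it to the single-DataFrame signature mentioned above. A minimal sketch, reusing the helper functions from the block above (the extra parameter and table name are hypothetical):

    from functools import partial

    from pyspark.sql import DataFrame

    from lakehouse_engine.core.exec_env import ExecEnv


    def calculate_kpi_for(df: DataFrame, target_table: str) -> DataFrame:
        """Same idea as calculate_kpi above, but with the target table as a parameter."""
        new_data_df = get_new_data(df)
        current_data_df = ExecEnv.SESSION.table(target_table)
        return get_joined_data(new_data_df, current_data_df)


    # in the ACON, bind the extra parameter so the engine still receives a df -> df callable:
    # "args": {"custom_transformer": partial(calculate_kpi_for, target_table="my_database.my_table")}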

    Don't like pyspark API? Write SQL

    \n\n

    You don't have to comply with the pyspark API if you prefer SQL. Inside the function above (or any of the auxiliary functions you decide to develop) you can write something like:

    \n\n
    \n
    def calculate_kpi(df: DataFrame) -> DataFrame:\n    df.createOrReplaceTempView("new_data")\n\n    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because is the internal object the\n    # lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine.\n    ExecEnv.SESSION.sql(\n        """\n          CREATE OR REPLACE TEMP VIEW my_kpi AS\n          SELECT ... FROM new_data ...\n        """\n    )\n\n    return ExecEnv.SESSION.table("my_kpi")\n
    \n
    \n\n

    Just your regular ACON

    \n\n

    If you notice the ACON below, everything is the same as you would do in a Data Product, but the transform_specs section of the ACON has one difference: a function called \"custom_transformation\", to which we supply the function defined above with the pyspark code as an argument.

    \n\n
    \n\n
    Attention!!!
    \n\n

    Do not pass the function as calculate_kpi(), but as calculate_kpi, otherwise you are telling python to invoke the function right away, as opposed to passing it as an argument to be invoked later by the lakehouse engine.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "sales",\n            "read_type": "streaming",\n            "data_format": "delta",\n            "db_table": "my_database.dummy_sales",\n            "options": {"readChangeFeed": "true"},\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "transformed_sales_kpi",\n            "input_id": "sales",\n            # because we are using streaming, this allows us to make sure that\n            # all the computation in our custom transformer gets pushed to\n            # Spark's foreachBatch method in a stream, which allows us to\n            # run all Spark functions in a micro batch DataFrame, as there\n            # are some Spark functions that are not supported in streaming.\n            "force_streaming_foreach_batch_processing": True,\n            "transformers": [\n                {\n                    "function": "custom_transformation",\n                    "args": {"custom_transformer": calculate_kpi},\n                },\n            ],\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "my_table_quality",\n            "input_id": "transformed_sales_kpi",\n            "dq_type": "validator",\n            "bucket": "my_dq_bucket",\n            "data_docs_bucket": "my_data_product_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "expectations_store_prefix": "dq/expectations/",\n            "validations_store_prefix": "dq/validations/",\n            "checkpoint_store_prefix": "dq/checkpoints/",\n            "tbl_to_derive_pk": "my_table",\n            "dq_functions": [\n                {"function": "expect_column_values_to_not_be_null", "args": {"column": "article_id"}},\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "sales_kpi",\n            "input_id": "transformed_sales_kpi",\n            "write_type": "merge",\n            "data_format": "delta",\n            "db_table": "my_database.my_table",\n            "options": {\n                "checkpointLocation": "s3://my_data_product_bucket/gold/my_table",\n            },\n            "merge_opts": {\n                "merge_predicate": "new.article_id = current.article_id AND new.order_date = current.order_date"\n            },\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_from_sap_b4_adso", "modulename": "lakehouse_engine_usage.data_loader.extract_from_sap_b4_adso", "kind": "module", "doc": "

    Extract from SAP B4 ADSOs

    \n\n

    A custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so that the consumption of data from\nSAP B4 ADSOs can be easily set up. The framework abstracts all the logic behind the init/delta extractions\n(AQ vs CL, active table, changelog table, requests status table, how to identify the next delta timestamp...),\nonly requiring a few parameters that are explained and exemplified in the\ntemplate scenarios that we have created.

    \n\n
    \n\n
    This custom reader is very similar to, and uses most features from, the sap_bw reader, so if you were using specific filters/parameters with the sap_bw reader, there is a high chance you can keep using them in a very similar way with the sap_b4 reader. The main concepts, such as the strategies on how to parallelize the extractions, apply to both readers.
    \n\n
    \n\n

    How can I find a good candidate column for partitioning the extraction from S4Hana?

    \n\n
    \n\n
    Parallelization Limitations
    \n\n

    There are no limits imposed by the Lakehouse-Engine framework, but you need to consider that there might be limits imposed by the source.

    \n\n

    E.g., each user might be restricted to using about 100GB of memory at a time on the source.

    \n\n

    Parallel extractions can bring a JDBC source down if a lot of stress is put on the system. Be careful when choosing the number of partitions: Spark is a distributed system and can open many connections to the source.

    \n\n
    \n\n
    \n\n
    In case you want to perform further filtering on the REQTSN field, please be aware that the filter is not pushed down to SAP B4 by default (meaning it will have bad performance).
    \n\n

    In that case, you will need to use the customSchema option while reading, so that filter pushdown can be enabled for that field.

    \n\n
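
    As an illustration (a sketch only, not reference documentation: the DECIMAL(23,0) precision below is an assumption and must match your source field definition), the customSchema option can be added to the input_specs options so that Spark knows the REQTSN type and is able to push further filters down:

    \n\n
    \n
    # Illustrative input spec only: declaring REQTSN in customSchema to enable filter pushdown.\n# The DECIMAL(23,0) precision is an assumption and must match the source definition.\ninput_spec_with_pushdown = {\n    "spec_id": "my_identifier_source",\n    "read_type": "batch",\n    "data_format": "sap_b4",\n    "options": {\n        "url": "my_sap_b4_url",\n        "user": "my_user",\n        "password": "my_b4_hana_pwd",\n        "dbtable": "my_database.my_table",\n        "adso_type": "AQ",\n        "extraction_type": "delta",\n        "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n        "customSchema": "REQTSN DECIMAL(23,0)",\n    },\n}\n
    \n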
    \n\n

    You can check the code documentation of the reader below:

    \n\n

    SAP B4 Reader

    \n\n

    JDBC Extractions arguments

    \n\n

    SAP B4 Extractions arguments

    \n\n
    \n\n
    For extractions using the SAP B4 reader, you can use the arguments listed in the SAP B4 arguments, but also the ones listed in the JDBC extractions, as those are inherited as well.
    \n\n
    \n\n

    Extraction from SAP B4 ADSOs Template

    \n\n

    This template covers the following scenarios of extractions from the SAP B4Hana ADSOs:

    \n\n\n\n
    \n\n

    Note: the template will cover two ADSO Types:

    \n\n\n\n
    \n\n

    For each of these ADSO types, the lakehouse-engine abstracts the logic to get the delta extractions. This logic\nbasically consists of joining the db_table (for AQ) or the changelog_table (for CL) with the table\nhaving the requests status (my_database.requests_status_table).\nOne of the fields used for this joining is the data_target, which has a relationship with the ADSO\n(db_table/changelog_table), being basically the same identifier without considering parts of it.

    \n\n

    Based on the previous insights, the queries that the lakehouse-engine generates under the hood translate to\n(this is a simplified version, for more details please refer to the lakehouse-engine code documentation):\nAQ Init Extraction:\nSELECT t.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table t

    \n\n

    AQ Delta Extraction:\nSELECT tbl.*, CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table AS tbl\nJOIN my_database.requests_status_table AS req\nWHERE STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')\nAND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('my_identifier')\nAND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table

    \n\n

    CL Init Extraction:\nSELECT t.*,\n {self._SAP_B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn,\n '0' AS datapakid,\n 0 AS record,\n CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table_2 t

    \n\n

    CL Delta Extraction:\nSELECT tbl.*,\nCAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table_3 AS tbl\nJOIN my_database.requests_status_table AS req\nWHERE STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')\nAND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('my_data_target')\nAND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table

    \n\n
    \n\n

    Introductory Notes: if you want to have a better understanding of JDBC Spark optimizations, here are a few useful links:

    \n\n\n\n
    \n\n

    1 - The Simplest Scenario (Not parallel - Not Recommended)

    \n\n

    This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques\nand using a single connection to retrieve all the data from the source. It should only be used in case the ADSO\nyou want to extract from SAP B4Hana is a small one, with no big requirements in terms of performance to fulfill.\nWhen extracting from the source ADSO, there are two options:

    \n\n\n\n

    The example below is composed of two cells.

    \n\n\n\n
    \n\n

    There may be cases where you might want to always extract fully from the source ADSO. In these cases,\nyou only need to use a Delta Init every time, meaning you would use \"extraction_type\": \"init\" and\n\"write_type\": \"overwrite\" as shown below. The explanation of what a Delta Init/Delta is\napplies to all the scenarios presented in this notebook.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "adso_type": "AQ",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2 - Parallel extraction

    \n\n

    In this section, 5 possible scenarios for parallel extractions from SAP B4Hana ADSOs are presented.

    \n\n

    2.1 - Parallel Extraction, Simplest Scenario

    \n\n

    This scenario provides the simplest example you can have for a parallel extraction from SAP B4Hana, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction in several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.

    \n\n

    In the example below, \"numPartitions\": 10 is specified, meaning that Spark will open 10 parallel connections\nto the source ADSO and automatically decide how to parallelize the extraction upon that requirement. This is the\nonly change compared to the example provided in scenario 1.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_sap_b4_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_par_simple/",\n                "adso_type": "AQ",\n                "numPartitions": 10,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_par_simple/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.2 - Parallel Extraction, Provide upper_bound (Recommended)

    \n\n

    This scenario performs the extraction from the SAP B4 ADSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed,\nusing the following options:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as\nthe partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and\nthree additional options are provided to fine tune the extraction (partitionColumn, lowerBound,\nupperBound).

    \n\n

    When these 4 properties are used, Spark will use them to build several queries to split the extraction.

    \n\n

    Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:

    \n\n\n\n
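
    As a purely illustrative sketch of the mechanism (this simplified function is not part of the lakehouse-engine; it just mimics Spark's JDBC range partitioning to show the resulting WHERE clauses):

    \n\n
    \n
    # Illustrative sketch only: a simplified reproduction of how Spark's JDBC reader splits\n# [lowerBound, upperBound] into numPartitions WHERE clauses (assumes num_partitions >= 2).\n# Not lakehouse-engine code.\ndef jdbc_partition_predicates(column: str, lower: int, upper: int, num_partitions: int) -> list:\n    stride = upper // num_partitions - lower // num_partitions\n    predicates = []\n    current = lower\n    for i in range(num_partitions):\n        lower_clause = f"{column} >= {current}" if i > 0 else None\n        current += stride\n        upper_clause = f"{column} < {current}" if i < num_partitions - 1 else None\n        if lower_clause and upper_clause:\n            predicates.append(f"{lower_clause} AND {upper_clause}")\n        elif lower_clause:\n            predicates.append(lower_clause)\n        else:\n            predicates.append(f"{upper_clause} OR {column} IS NULL")\n    return predicates\n\n\n# ['record < 11 OR record IS NULL', 'record >= 11 AND record < 21', ..., 'record >= 91']\nprint(jdbc_partition_predicates("record", 1, 100, 10))\n
    \n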
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_par_prov_upper/",\n                "adso_type": "AQ",\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n                "upperBound": 1000000,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_par_prov_upper/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.3 - Parallel Extraction, Automatic upper_bound (Recommended)

    \n\n

    This scenario is very similar to 2.2, the only difference being that upperBound\nis not provided. Instead, the property calculate_upper_bound set to true is used to benefit\nfrom the automatic calculation of the upperBound (derived from the partitionColumn) offered by the\nlakehouse-engine framework, which is useful, as in most cases you will probably not be aware of\nthe max value for the column. The only thing you need to consider is that if you use this automatic\ncalculation of the upperBound, you will be doing an initial query to the SAP B4 ADSO to retrieve the max\nvalue of the partitionColumn, before doing the actual query to perform the extraction.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "calculate_upper_bound": True,\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "dbtable": "my_database.my_table",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_par_calc_upper/",\n                "adso_type": "AQ",\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_par_calc_upper/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.4 - Parallel Extraction, Provide Predicates (Recommended)

    \n\n

    This scenario performs the extraction from SAP B4 ADSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction (e.g. when extracting from ADSO of Type CL,\nthe active table does not have the RECORD column, which is usually a good option for scenarios 2.2 and 2.3):

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as\nthe partitionColumn, especially if such columns do not comply with scenario 2.2 or 2.3.

    \n\n

    When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.

    \n\n

    Below, the lakehouse-engine function to automatically generate the predicates list is presented.

    \n\n

    This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.

    \n\n

    predicates_query: in the sample below the whole table is considered (select distinct(x) from table),\nbut it is possible to filter the predicates list here, especially if you are applying a filter in the transform specs\nand you know the entire table won't be necessary, so you can change it to something like: select distinct(x)\nfrom table where x > y.

    \n\n

    predicates_add_null: you can decide whether or not to consider null in the predicates list; by default\nthis property is True.

    \n\n

    Example: for \"partition_column\": \"CALMONTH\"

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\n# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers\n# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\n\nExecEnv.get_or_create()\n\npartition_column = "CALMONTH"\ndbtable = "my_database.my_table_3"\n\npredicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""\nuser = "my_user"\npassword = "my_b4_hana_pwd"\nurl = "my_sap_b4_url"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        predicates_add_null=predicates_add_null,\n        partition_column=partition_column,\n        dbtable=dbtable,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_2_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "driver": "com.sap.db.jdbc.Driver",\n                "dbtable": "my_database.my_table_2",\n                "changelog_table": "my_database.my_table_3",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_2_prov_predicates/",\n                "adso_type": "CL",\n                "predicates": predicates,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_2_bronze",\n            "input_id": "my_identifier_2_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_2_prov_predicates/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.5 - Parallel Extraction, Generate Predicates

    \n\n

    This scenario is very similar to the scenario 2.4, with the only difference that it automatically\ngenerates the predicates (\"generate_predicates\": True).

    \n\n

    This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as\nthe partitionColumn, especially if such columns do not comply with scenarios 2.2 and 2.3 (otherwise\nthose would probably be recommended).

    \n\n

    When this property is used, the lakehouse engine will generate the predicates to be used to extract data from\nthe source. What the lakehouse engine does is check, for the init/delta portion of the data,\nwhat the distinct values of the partitionColumn serving that data are. Then, these values will be used by\nSpark to generate several queries to extract from the source in a parallel fashion.\nEach distinct value of the partitionColumn will be a query, meaning that you will not have control over the\nnumber of partitions used for the extraction. For example, if you face a scenario in which you\nare using a partitionColumn LOAD_DATE and for today's delta, all the data (let's suppose 2 million rows) is\nserved by a single LOAD_DATE = 20200101, that would mean Spark would use a single partition\nto extract everything. In this extreme case you would probably need to change your partitionColumn. Note:\nsuch extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.

    \n\n

    Example: for \"partitionColumn\": \"record\"\nGenerate predicates:

    \n\n\n\n

    Spark will generate 100 queries like this:

    \n\n\n\n

    Generate predicates will also consider null by default:

    \n\n\n\n

    To disable this behaviour the following variable value should be changed to false: \"predicates_add_null\": False

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_2_source",\n            "read_type": "batch",\n            "data_format": "sap_b4",\n            "generate_predicates": True,\n            "options": {\n                "url": "my_sap_b4_url",\n                "user": "my_user",\n                "password": "my_b4_hana_pwd",\n                "driver": "com.sap.db.jdbc.Driver",\n                "dbtable": "my_database.my_table_2",\n                "changelog_table": "my_database.my_table_3",\n                "extraction_type": extraction_type,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_2_gen_predicates/",\n                "adso_type": "CL",\n                "partitionColumn": "CALMONTH",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_2_bronze",\n            "input_id": "my_identifier_2_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/my_identifier_2_gen_predicates/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_from_sap_bw_dso", "modulename": "lakehouse_engine_usage.data_loader.extract_from_sap_bw_dso", "kind": "module", "doc": "

    Extract from SAP BW DSOs

    \n\n
    \n\n
    Parallelization Limitations
    \n\n

    Parallel extractions can bring a JDBC source down if a lot of stress is put on the system. Be careful when choosing the number of partitions: Spark is a distributed system and can open many connections to the source.

    \n\n
    \n\n

    A custom sap_bw reader and a few utils are offered in the lakehouse-engine framework so that the consumption of data from \nSAP BW DSOs can be easily set up. The framework abstracts all the logic behind the init/delta extractions \n(active table, changelog table, activation requests table, how to identify the next delta timestamp...), \nonly requiring a few parameters that are explained and exemplified in the \ntemplate scenarios that we have created.

    \n\n

    This page also provides you a section to help you figure out a good candidate for partitioning the extraction from SAP BW.

    \n\n

    You can check the code documentation of the reader below:

    \n\n

    SAP BW Reader

    \n\n

    JDBC Extractions arguments

    \n\n

    SAP BW Extractions arguments

    \n\n
    \n\n

    For extractions using the SAP BW reader, you can use the arguments listed in the SAP BW arguments, but also \nthe ones listed in the JDBC extractions, as those are inherited as well.

    \n\n
    \n\n

    Extraction from SAP-BW template

    \n\n

    This template covers the following scenarios of extractions from the SAP BW DSOs:

    \n\n\n\n
    \n\n

    Introductory Notes: if you want to have a better understanding of JDBC Spark optimizations, \nhere are a few useful links:

    \n\n\n\n
    \n\n

    1 - The Simplest Scenario (Not parallel - Not Recommended)

    \n\n

    This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques
    \nand using a single connection to retrieve all the data from the source. It should only be used in case the DSO \nyou want to extract from SAP BW is a small one, with no big requirements in terms of performance to fulfill.\nWhen extracting from the source DSO, there are two options:

    \n\n\n\n

    The example below is composed of two cells.

    \n\n\n\n
    \n\n

    There may be cases where you might want to always extract fully from the source DSO. In these cases,\nyou only need to use a Delta Init every time, meaning you would use \"extraction_type\": \"init\" and\n\"write_type\": \"overwrite\" as shown below. The explanation of what a Delta Init/Delta is\napplies to all the scenarios presented in this notebook.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            # You should use this custom reader to benefit from the lakehouse-engine utils for extractions from SAP BW\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2 - Parallel extraction

    \n\n

    In this section, 6 possible scenarios for parallel extractions from SAP BW DSOs are presented.

    \n\n

    2.1 - Parallel Extraction, Simplest Scenario

    \n\n

    This scenario provides the simplest example you can have for a parallel extraction from SAP BW, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction in several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential. \nIn the example below, \"numPartitions\": 10 is specified, meaning that Spark will open 10 parallel connections\nto the source DSO and automatically decide how to parallelize the extraction upon that requirement. This is the\nonly change compared to the example provided in scenario 1.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 10,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.2 - Parallel Extraction, Provide upper_bound (Recommended)

    \n\n

    This scenario performs the extraction from the SAP BW DSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using\nthe following options:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as\nthe partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and\nthree additional options are provided to fine tune the extraction (partitionColumn, lowerBound,\nupperBound).

    \n\n

    When these 4 properties are used, Spark will use them to build several queries to split the extraction.

    \n\n

    Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 3,\n                "partitionColumn": "my_partition_col",\n                "lowerBound": 1,\n                "upperBound": 42,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.3 - Parallel Extraction, Automatic upper_bound (Recommended)

    \n\n

    This scenario is very similar to 2.2, the only difference being that upperBound\nis not provided. Instead, the property calculate_upper_bound set to true is used to benefit\nfrom the automatic calculation of the upperBound (derived from the partitionColumn) offered by the\nlakehouse-engine framework, which is useful, as in most cases you will probably not be aware of\nthe max value for the column. The only thing you need to consider is that if you use this automatic\ncalculation of the upperBound, you will be doing an initial query to the SAP BW DSO to retrieve the max\nvalue of the partitionColumn, before doing the actual query to perform the extraction.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "calculate_upper_bound": True,\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 10,\n                "partitionColumn": "my_partition_col",\n                "lowerBound": 1,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.4 - Parallel Extraction, Backfilling

    \n\n

    This scenario covers the case in which you might want to backfill the data extracted from a SAP BW DSO and\nmade available in the bronze layer. By default, the delta extraction considers the max value of the column\nactrequest_timestamp on the data already extracted. However, there might be cases in which you might want\nto extract a delta from a particular timestamp onwards or for a particular interval of time. For this, you\ncan use the properties min_timestamp and max_timestamp.

    \n\n

    Below, a very similar example to the previous one is provided, the only differences being that\nthe properties \"min_timestamp\": \"20210910000000\" and \"max_timestamp\": \"20210913235959\" are provided,\nmeaning it will extract the data from the changelog table, using a filter\nactrequest_timestamp > \"20210910000000\" and actrequest_timestamp <= \"20210913235959\", ignoring whether some of the data is already\navailable in the destination or not. Moreover, note that the property latest_timestamp_data_location\ndoes not need to be provided, as the timestamps to be considered are being directly provided (if both\nthe timestamps and the latest_timestamp_data_location are provided, the latter will have no effect).\nAdditionally, \"extraction_type\": \"delta\" and \"write_type\": \"append\" are forced, instead of using the\nvariables as in the other examples, because the backfilling scenario only makes sense for delta extractions.

    \n\n
    \n\n

    Note: be aware that the backfilling example being shown has no mechanism to enforce that\nyou don't generate duplicated data in bronze. For your scenarios, you can either use this example and solve\nany duplication in the silver layer or extract the delta with a merge strategy while writing to bronze,\ninstead of appending.

    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "calculate_upper_bound": True,\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "changelog_table": "my_database.my_changelog_table",\n                "extraction_type": "delta",\n                "numPartitions": 10,\n                "partitionColumn": "my_partition_col",\n                "lowerBound": 1,\n                "min_timestamp": "20210910000000",\n                "max_timestamp": "20210913235959",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.5 - Parallel Extraction, Provide Predicates (Recommended)

    \n\n

    This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as\nthe partitionColumn, especially if such columns do not comply with scenarios 2.2 and 2.3 (otherwise\nthose would probably be recommended).

    \n\n

    When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.

    \n\n

    Below, the lakehouse-engine function to automatically generate the predicates list is presented.

    \n\n

    This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.

    \n\n

    predicates_query: in the sample below the whole table is considered (select distinct(x) from table),\nbut it is possible to filter the predicates list here,\nespecially if you are applying a filter in the transform specs and you know the entire table won't be necessary, so\nyou can change it to something like: select distinct(x) from table where x > y.

    \n\n

    predicates_add_null: you can decide whether or not to consider null in the predicates list; by default this\nproperty is True.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\n# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers\n# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\n\nExecEnv.get_or_create()\n\npartition_column = "my_partition_column"\ndbtable = "my_database.my_table"\n\npredicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""\ncolumn_for_predicates = partition_column\nuser = "my_user"\npassword = "my_hana_pwd"\nurl = "my_bw_url"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        dbtable=dbtable,\n        partition_column=partition_column,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "predicates": predicates,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.6 - Parallel Extraction, Generate Predicates (Recommended)

    \n\n

    This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:

    \n\n\n\n

    This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as\nthe partitionColumn, especially if such columns do not comply with scenarios 2.2 and 2.3 (otherwise\nthose would probably be recommended).

    \n\n

    When this property is used, the lakehouse engine will generate the predicates to be used to extract data from\nthe source. What the lakehouse engine does is check, for the init/delta portion of the data,\nwhat the distinct values of the partitionColumn serving that data are. Then, these values will be used by\nSpark to generate several queries to extract from the source in a parallel fashion.\nEach distinct value of the partitionColumn will be a query, meaning that you will not have control over the\nnumber of partitions used for the extraction. For example, if you face a scenario in which you\nare using a partitionColumn LOAD_DATE and for today's delta, all the data (let's suppose 2 million rows) is\nserved by a single LOAD_DATE = 20200101, that would mean Spark would use a single partition\nto extract everything. In this extreme case you would probably need to change your partitionColumn. Note:\nsuch extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.

    \n\n

    Example: for \"partitionColumn\": \"record\"\nGenerate predicates:

    \n\n\n\n

    Spark will generate 100 queries like this:

    \n\n\n\n

    Generate predicates will also consider null by default:

    \n\n\n\n

    To disable this behaviour the following variable value should be changed to false: \"predicates_add_null\": False

    \n\n
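
    As a rough, purely conceptual illustration (assuming the distinct values of my_partition_col served by this init/delta are 1 to 100; the exact predicate format produced by the engine may differ), the generated predicates translate into one query per value:

    \n\n
    \n
    # Conceptual illustration only (assumed values and format; not the exact engine output).\n# One predicate per distinct value of the partition column, plus a null predicate because\n# "predicates_add_null" defaults to True.\npredicates = [f"my_partition_col = {value}" for value in range(1, 101)]\npredicates.append("my_partition_col IS NULL")\n\n# Spark then issues one query per predicate, conceptually:\n#   SELECT * FROM my_database.my_table WHERE my_partition_col = 1\n#   SELECT * FROM my_database.my_table WHERE my_partition_col = 2\n#   ...\n#   SELECT * FROM my_database.my_table WHERE my_partition_col IS NULL\nprint(len(predicates))  # 101\n
    \n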
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "generate_predicates": True,\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "partitionColumn": "my_partition_col",\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    3 - Extraction from Write Optimized DSOs

    \n\n

    This scenario is based on the best practices of scenario 2.2, but it is ready to extract data from\nWrite Optimized DSOs, which have the changelog embedded in the active table, instead of having a separate\nchangelog table. For this reason, you need to specify that the changelog_table parameter value is equal\nto the dbtable parameter value.\nMoreover, these tables usually already include the changelog technical columns\nlike RECORD and DATAPAKID, for example, that the framework adds by default. Thus, you need to specify\n\"include_changelog_tech_cols\": False to change this behaviour.\nFinally, you also need to specify the name of the column in the table that can be used to join with the\nactivation requests table to get the timestamp of the several requests/deltas; the default is\n\"actrequest\", so for Write Optimized DSOs you typically need to set \"request_col_name\": \"request\", as in the example below.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nLOAD_TYPE = "INIT" or "DELTA"\n\nif LOAD_TYPE == "INIT":\n    extraction_type = "init"\n    write_type = "overwrite"\nelse:\n    extraction_type = "delta"\n    write_type = "append"\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "changelog_table": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "request_col_name": "request",\n                "include_changelog_tech_cols": False,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier/",\n                "extraction_type": extraction_type,\n                "numPartitions": 2,\n                "partitionColumn": "RECORD",\n                "lowerBound": 1,\n                "upperBound": 50000,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": write_type,\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    3.1 - Extraction from Write Optimized DSOs, Get ACTREQUEST_TIMESTAMP from Activation Requests Table

    \n\n

    By default, the actrequest_timestamp is hardcoded in the init extraction (it either assumes a given\nextraction_timestamp or the current timestamp). However, this may cause problems when merging changes in silver\nfor Write Optimized DSOs. So, a new possibility was added to choose to retrieve this timestamp from the\nactivation requests table instead.

    \n\n

    This scenario performs the data extraction from Write Optimized DSOs, forcing the actrequest_timestamp to\nassume the value from the activation requests table (timestamp column).

    \n\n

    This feature is only available for Write Optimized DSOs, and to use it you need to specify \"get_timestamp_from_act_request\": True, as in the example below.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "sap_bw",\n            "options": {\n                "user": "my_user",\n                "password": "my_hana_pwd",\n                "url": "my_sap_bw_url",\n                "dbtable": "my_database.my_table",\n                "changelog_table": "my_database.my_table",\n                "odsobject": "my_ods_object",\n                "request_col_name": "request",\n                "include_changelog_tech_cols": False,\n                "latest_timestamp_data_location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP/",\n                "extraction_type": "init",\n                "numPartitions": 2,\n                "partitionColumn": "RECORD",\n                "lowerBound": 1,\n                "upperBound": 50000,\n                "get_timestamp_from_act_request": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["actrequest_timestamp"],\n            "location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    How can we decide the partitionColumn?

    \n\n

    Compatible partitionColumn for upperBound/lowerBound Spark options:

    \n\n

    It needs to be int, date, timestamp \u2192 https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html

    \n\n

    If you don't have any column to partition on those formats, you can use predicates to partition the table \u2192 https://docs.databricks.com/en/connect/external-systems/jdbc.html#manage-parallelism

    \n\n

    One of the most important parameters to optimise the extraction is the partitionColumn, as you can see in the template. Thus, this section helps you figure out if a column is a good candidate or not.

    \n\n

    Basically the partition column needs to be a column which is able to adequately split the processing, which means we can use it to \"create\" different queries with intervals/filters, so that the Spark tasks process similar amounts of rows/volume. Usually a good candidate is an integer auto-increment technical column.

    \n\n
    \n\n
    Although RECORD is usually a good candidate, it is typically available on the changelog table only, meaning that you would need to use a different strategy for the init. In case you don't have good candidates for partitionColumn, you can use the sample acon provided in scenario 2.1 in the template above. It might make sense to use scenario 2.1 for the init and then scenario 2.2 or 2.3 for the subsequent deltas.
    \n\n
    \n\n

    When there is no int, date or timestamp good candidate for partitionColumn:

    \n\n

    In this case you can opt for the Provide/Generate Predicates scenarios (2.5/2.6), which support any kind of column to be defined as the partitionColumn.

    \n\n

    However, you should still analyse whether the column you are thinking about is a good candidate or not. In those scenarios, Spark will create one query per distinct value of the partitionColumn, so you can perform some analysis first, as sketched below.

    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_from_sftp", "modulename": "lakehouse_engine_usage.data_loader.extract_from_sftp", "kind": "module", "doc": "

    Extract from SFTP

    \n\n

    Secure File Transfer Protocol (SFTP) is a file protocol for transferring files over the web.

    \n\n

    This feature is available in the Lakehouse Engine with the purpose of having a mechanism to read data directly from SFTP directories without moving those files manually/physically to an S3 bucket.

    \n\n

    The engine uses Pandas to read the files and converts them into a Spark dataframe, which makes the usual ACON capabilities available, such as dq_specs, output_specs, terminator_specs and transform_specs.

    \n\n

    Furthermore, this feature provides several filters on the directories that make it easier to control the extractions.

    \n\n

    Introductory Notes:

    \n\n

    There are important parameters that must be added to input specs in order to make the SFTP extraction work properly:

    \n\n
    \n\n
    Read type
    The engine supports only BATCH mode for this feature.
    \n\n
    \n\n

    sftp_files_format - File format that will be used to read data from SFTP. The engine supports: CSV, FWF, JSON and XML.

    \n\n

    location - The SFTP directory to be extracted. If it is necessary to filter a specific file, it can be done using the file_name_contains option.

    \n\n

    options - Arguments used to set the Paramiko SSH client connection (hostname, username, password, port...), set the filter to retrieve files and set the file parameters (separators, headers, cols...). For more information about the file parameters, please go to the Pandas link in the useful links section.

    \n\n

    The options allowed are:

    Property type | Detail | Example | Comment
    Connection | add_auto_policy (str) | true or false | Indicates whether to allow an SFTP connection using no host key. When a connection attempt is made using no host key and add_auto_policy is false, the engine will throw an exception. The purpose of this flag is to make the user consciously choose a less secure connection.
    Connection | key_type (str) | \"Ed25519\" or \"RSA\" | Indicates the key type to be used for the connection (SSH, Ed25519).
    Connection | key_filename (str) | \"/path/to/private_key/private_key.ppk\" | The filename, or list of filenames, of optional private key(s) and/or certs to try for authentication. It must be used with a pkey in order to add a policy. If a pkey is not provided, then use add_auto_policy.
    Connection | pkey (str) | \"AAAAC3MidD1lVBI1NTE5AAAAIKssLqd6hjahPi9FBH4GPDqMqwxOMsfxTgowqDCQAeX+\" | Value to use for the host key when connecting to the remote SFTP server.
    Filter | date_time_gt (str) | \"1900-01-01\" or \"1900-01-01 08:59:59\" | Filter the files greater than the string datetime formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
    Filter | date_time_lt (str) | \"3999-12-31\" or \"3999-12-31 20:59:59\" | Filter the files lower than the string datetime formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
    Filter | earliest_file (bool) | true or false | Filter the earliest dated file in the directory.
    Filter | file_name_contains (str) | \"part_of_filename\" | Filter files when they match the pattern.
    Filter | latest_file (bool) | true or false | Filter the most recent dated file in the directory.
    Read data from subdirectories | sub_dir (bool) | true or false | The engine will search for files in subdirectories of the location, considering one level below the root location given. When sub_dir is used with the latest_file/earliest_file argument, the engine will retrieve the latest/earliest file for each subdirectory.
    Add metadata info | file_metadata (bool) | true or false | When this option is set to True, the dataframe retrieves the filename (with location) and the modification_time from the original files in SFTP, attaching these two columns to the respective records.
    \n\n

    Useful Info & Links:

    \n\n
    \n
    1. Paramiko SSH Client
    2. Pandas documentation
    \n\n

    Scenario 1

    \n\n

    The scenario below shows the extraction of a CSV file using most of the available filter options. Also, as an example, the column \"created_on\" is created in the transform_specs in order to store the processing date for every record. As a result, the output table will have the original file date (provided by the option file_metadata) and the processing date from the engine.

    \n\n

    For an incremental load approach, it is advised to use the \"modification_time\" column created by the option file_metadata. Since it holds the original modification date of each file, it can be used in the logic that controls what is new or has changed recently, as sketched below.

    \n\n
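    A minimal, hedged sketch of such an incremental filter, reusing the get_max_value and incremental_filter pattern shown in the JDBC delta examples later in this document. The spec ids below are hypothetical, and this is only a fragment of the transform_specs (an additional input spec reading the existing bronze output, here called \"sftp_bronze\", would also be needed), not a full acon:

    \n\n
    \n
    "transform_specs": [\n    {\n        "spec_id": "max_sftp_bronze_date",\n        "input_id": "sftp_bronze",\n        "transformers": [{"function": "get_max_value", "args": {"input_col": "modification_time"}}],\n    },\n    {\n        "spec_id": "incremental_sftp",\n        "input_id": "sftp_source",\n        "transformers": [\n            {\n                "function": "incremental_filter",\n                "args": {"input_col": "modification_time", "increment_df": "max_sftp_bronze_date"},\n            }\n        ],\n    },\n],\n
    \n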
    \n\n
    The scenario below uses \"add_auto_policy\": true, which is not recommended.
    \n\n
    \n\n
    \n
    from datetime import datetime\nfrom lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n      {\n          "spec_id": "sftp_source",\n          "read_type": "batch",\n          "data_format": "sftp",\n          "sftp_files_format": "csv",\n          "location": "my_sftp_data_path",\n          "options": {\n              "hostname": "my_sftp_hostname",\n              "username": "my_sftp_username",\n              "password": "my_sftp_password",\n              "port": "my_port",\n              "add_auto_policy": True,\n              "file_name_contains": "test_pattern",\n              "args": {"sep": "|"},\n              "latest_file": True,\n              "file_metadata": True\n          }\n      },\n  ],\n  "transform_specs": [\n      {\n          "spec_id": "sftp_transformations",\n          "input_id": "sftp_source",\n          "transformers": [\n              {\n                  "function": "with_literals",\n                  "args": {"literals": {"created_on": datetime.now()}},\n              },\n          ],\n      },\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sftp_bronze",\n      "input_id": "sftp_transformations",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "s3://my_path/dummy_table"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    Scenario 2

    \n\n

    The following scenario shows the extraction of a JSON file using RSA pkey authentication instead of add_auto_policy. The engine supports Ed25519Key and RSA for pkeys.

    \n\n

    For the pkey file location, it is important to have the file in a location accessible by the cluster. This can be achieved either by mounting the location or with volumes.

    \n\n
    \n\n\n\n
    \n\n
    \n
    from datetime import datetime\nfrom lakehouse_engine.engine import load_data\n\nacon = {\n  "input_specs": [\n      {\n          "spec_id": "sftp_source",\n          "read_type": "batch",\n          "data_format": "sftp",\n          "sftp_files_format": "json",\n          "location": "my_sftp_data_path",\n          "options": {\n              "hostname": "my_sftp_hostname",\n              "username": "my_sftp_username",\n              "password": "my_sftp_password",\n              "port": "my_port",\n              "key_type": "RSA",\n              "key_filename": "dbfs_mount_location/my_file_key.ppk",\n              "pkey": "my_key",\n              "latest_file": True,\n              "file_metadata": True,\n              "args": {"lines": True, "orient": "columns"},\n          },\n      },\n  ],\n  "transform_specs": [\n      {\n          "spec_id": "sftp_transformations",\n          "input_id": "sftp_source",\n          "transformers": [\n              {\n                  "function": "with_literals",\n                  "args": {"literals": {"lh_created_on": datetime.now()}},\n              },\n          ],\n      },\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sftp_bronze",\n      "input_id": "sftp_transformations",\n      "write_type": "overwrite",\n      "data_format": "delta",\n      "location": "s3://my_path/dummy_table"\n    }\n  ]\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.extract_using_jdbc_connection", "modulename": "lakehouse_engine_usage.data_loader.extract_using_jdbc_connection", "kind": "module", "doc": "

    Extract using JDBC connection

    \n\n
    \n\n
    SAP Extraction
    \n\n

    SAP is only used as an example to demonstrate how we can use a JDBC connection to extract data.

    \n\n

    If you are looking to extract data from SAP, please use our sap_b4 or sap_bw reader.

    \n\n

    You can find the sap_b4 reader documentation: Extract from SAP B4 ADSOs and the sap_bw reader documentation: Extract from SAP BW DSOs

    \n\n
    \n\n
    \n\n
    Parallel Extraction
    \n\n

    Parallel extractions can bring a JDBC source down if a lot of stress is put on the system. Be careful when choosing the number of partitions, as Spark is a distributed system and may open many connections to the source.

    \n\n
    \n\n

    Introduction

    \n\n

    Many databases allow a JDBC connection to extract data. Our engine has one reader where you can configure all the necessary definitions to connect to a database using JDBC.

    \n\n

    In the next section you will find several examples about how to do it.

    \n\n

    The Simplest Scenario using sqlite

    \n\n
    \n\n\n\n
    \n\n

    This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source.

    \n\n

    Here we use a sqlite database where any connection is allowed. Due to that, we do not specify any username or password.

    \n\n

    Similarly to Spark, we provide two different ways to run the jdbc reader.

    \n\n

    1 - We can use the jdbc() function, passing it all the arguments Spark needs, and we can even combine this with additional options passed through .options().

    \n\n

    2 - The other way is to use .format(\"jdbc\") and pass all the necessary arguments through .options(). It is worth noting that, when choosing jdbc(), we can also add options() to the execution.

    \n\n

    You can find and run the following code in our local test for the engine.

    \n\n

    jdbc() function

    \n\n

    As we can see in the next cell, all the arguments necessary to establish the jdbc connection are passed inside the jdbc_args object. Here we find the url, the table, and the driver. Besides that, we can add options, such as the partition number. The partition number will impact the parallelism of the queries.

    \n\n

    The code below is an example of how to use the jdbc() function in our ACON.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "jdbc",\n      "jdbc_args": {\n        "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db",\n        "table": "jdbc_function",\n        "properties": {\n          "driver": "org.sqlite.JDBC"\n        }\n      },\n      "options": {\n        "numPartitions": 1\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "overwrite",\n      "db_table": "test_db.jdbc_function_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_function/correct_arguments/data"\n    }\n  ]\n}\n
    \n
    \n\n

    This is the same as using the following code in PySpark:

    \n\n
    \n
    spark.read.option("numPartitions", 1).jdbc(\n  url="jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db",\n  table="jdbc_function",\n  properties={"driver": "org.sqlite.JDBC"},\n)\n
    \n
    \n\n

    .format(\"jdbc\")

    \n\n

    In this example we do not use the jdbc_args object. All the jdbc connection parameters are passed inside the options dictionary.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "jdbc",\n      "options": {\n        "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db",\n        "dbtable": "jdbc_format",\n        "driver": "org.sqlite.JDBC",\n        "numPartitions": 1\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "overwrite",\n      "db_table": "test_db.jdbc_format_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_format/correct_arguments/data"\n    }\n  ]\n}\n
    \n
    \n\n

    This is the same as using the following code in PySpark:

    \n\n
    \n
    spark.read.format("jdbc")\n    .option("url", "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db")\n    .option("driver", "org.sqlite.JDBC")\n    .option("dbtable", "jdbc_format")\n    .option("numPartitions", 1)\n
    \n
    \n\n

    Template with more complete and runnable examples

    \n\n

    In this template we will use SAP as the source for a more complete and runnable example.\nThese definitions can be used with several databases that allow a JDBC connection.

    \n\n

    The following extraction scenarios are covered:

    \n\n\n\n
    \n\n

    Disclaimer: This template only uses SAP as a demonstration example for a JDBC connection.\nThis is not a SAP template!\nIf you are looking to extract data from SAP, please use our sap_b4 reader or the sap_bw reader.

    \n\n
    \n\n

    The JDBC connection has 2 main sections to be filled, the jdbc_args and options:

    \n\n\n\n

    If you want to know more regarding jdbc spark options you can follow the link below:

    \n\n\n\n

    If you want to have a better understanding about JDBC Spark optimizations, you can find them in the following:

    \n\n\n\n\n\n

    This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques\nand using a single connection to retrieve all the data from the source. It should only be used when the data\nyou want to extract is small and there are no big performance requirements to fulfill.\nWhen extracting from the source, we can have two options:

    \n\n
    1. Init - an initial load of all the data from the source into the bronze bucket.
    2. Delta - an incremental load that appends only the new data, filtered from the source based on a control column (REQTSN in the example below).
    \n\n
    Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_jdbc_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2 - Parallel extraction

    \n\n

    In this section we present 3 possible scenarios for parallel extraction from JDBC sources.

    \n\n
    \n\n

    Disclaimer for parallel extraction: Parallel extractions can bring a JDBC source down if a lot of stress\nis put on the system. Be careful when choosing the number of partitions, as Spark is a distributed system\nand may open many connections to the source.

    \n\n
    \n\n

    2.1 - Parallel Extraction, Simplest Scenario

    \n\n

    This scenario provides the simplest example you can have for a parallel extraction from JDBC sources, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch experience around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction in several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.

    \n\n

    In the example below, \"numPartitions\": 10 is specified, meaning that Spark will open 10 parallel connections\nto the source and automatically decide how to parallelize the extraction based on that requirement. This is the\nonly change compared to the example provided in scenario 1.

    \n\n
    Delta Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n                "numPartitions": 10,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n                "numPartitions": 10,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["REQTSN"],\n            "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.2 - Parallel Extraction, Provide upper_bound (Recommended)

    \n\n

    This scenario performs the extraction from the JDBC source in parallel, but takes more care to\noptimize the extraction and to have more control (compared to the 2.1 example) over how it is split and performed,\nusing the following options:

    \n\n\n\n

    This is an adequate example to follow if there is a column in the data source that is well suited to\nbe used as the partitionColumn. Compared with the previous example,\nnumPartitions and three additional options to fine tune the extraction (partitionColumn, lowerBound,\nupperBound) are provided.

    \n\n

    When these 4 properties are used, Spark will use them to build several queries to split the extraction.\nExample: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound\": 1, \"upperBound\": 100,\nSpark will generate 10 queries like the ones sketched below.

    \n\n\n\n
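    The following is a minimal sketch, not Spark's or the engine's actual code, approximating how Spark derives the per-partition WHERE clauses from those 4 properties (the real logic lives inside Spark's JDBC data source); it is shown only to illustrate the split for the example values above:

    \n\n
    \n
    # Hedged approximation of Spark's JDBC partitioning for the example above.\nnum_partitions, lower_bound, upper_bound = 10, 1, 100\nstride = upper_bound // num_partitions - lower_bound // num_partitions\ncurrent = lower_bound\nfor i in range(num_partitions):\n    if i == 0:\n        # first partition also picks up null values\n        print(f"WHERE record < {current + stride} OR record IS NULL")\n    elif i == num_partitions - 1:\n        # last partition has no upper bound\n        print(f"WHERE record >= {current}")\n    else:\n        print(f"WHERE record >= {current} AND record < {current + stride}")\n    current += stride\n
    \n
    \n\n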
    Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n                "upperBound": 2000,\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "partitionColumn": "RECORD",\n                "numPartitions": 10,\n                "lowerBound": 1,\n                "upperBound": 2000,\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    2.3 - Parallel Extraction with Predicates (Recommended)

    \n\n

    This scenario performs the extraction from the JDBC source in parallel, which is useful in contexts where there is no\nnumeric, date or timestamp column suitable to parallelize the extraction:

    \n\n\n\n\n\n

    When this property is used, all predicates need to be provided to Spark, otherwise data will be left behind.

    \n\n

    Below, a Lakehouse Engine utility to generate the predicates list automatically is presented.

    \n\n

    When using this function, one needs to be especially careful with the predicates_query and predicates_add_null variables.

    \n\n

    predicates_query: In the sample below, the whole table (select distinct(x) from table) is being considered,\nbut it is possible to filter the predicates list here, especially if you are applying a filter in the\ntransform specs and you know the entire table won't be necessary, so you can change it to something like:\nselect distinct(x) from table where x > y.

    \n\n

    predicates_add_null: One can choose whether to include null values in the predicates list or not. By default, this property is True.\nExample: for \"partitionColumn\": \"record\", a predicate covering the null values of record is also generated, as sketched below.

    \n\n
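    A hedged illustration of the kind of predicates list that get_predicates may return for \"partitionColumn\": \"record\" (the values come from the distinct values returned by predicates_query, and the exact string format depends on the engine's implementation):

    \n\n
    \n
    # Hypothetical output, assuming the distinct record values are 1, 2 and 3:\npredicates = [\n    "record='1'",\n    "record='2'",\n    "record='3'",\n    "record IS NULL",  # appended because predicates_add_null is True\n]\n
    \n
    \n\n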
    Init - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\nExecEnv.get_or_create()\n\npartitionColumn = "my_partition_col"\ndbtable = "my_database.my_table"\n\npredicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})"""\ncolumn_for_predicates = partitionColumn\nuser = "my_user"\npassword = "my_b4_hana_pwd"\nurl = "my_sap_b4_url"\ndriver = "com.sap.db.jdbc.Driver"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        predicates_add_null=predicates_add_null,\n        partition_column=partitionColumn,\n        dbtable=dbtable,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "predicates": predicates,\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "my_identifier_source",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n\n
    Delta - Load data into the Bronze Bucket
    \n\n
    \n
    from lakehouse_engine.engine import load_data\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.extraction.jdbc_extraction_utils import (\n    JDBCExtraction,\n    JDBCExtractionUtils,\n)\nExecEnv.get_or_create()\n\npartitionColumn = "my_partition_col"\ndbtable = "my_database.my_table"\n\npredicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})"""\ncolumn_for_predicates = partitionColumn\nuser = "my_user"\npassword = "my_b4_hana_pwd"\nurl = "my_sap_b4_url"\ndriver = "com.sap.db.jdbc.Driver"\npredicates_add_null = True\n\njdbc_util = JDBCExtractionUtils(\n    JDBCExtraction(\n        user=user,\n        password=password,\n        url=url,\n        predicates_add_null=predicates_add_null,\n        partition_column=partitionColumn,\n        dbtable=dbtable,\n    )\n)\n\npredicates = jdbc_util.get_predicates(predicates_query)\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_identifier_source",\n            "read_type": "batch",\n            "data_format": "jdbc",\n            "jdbc_args": {\n                "url": "my_sap_b4_url",\n                "table": "my_database.my_table",\n                "predicates": predicates,\n                "properties": {\n                    "user": "my_user",\n                    "password": "my_b4_hana_pwd",\n                    "driver": "com.sap.db.jdbc.Driver",\n                },\n            },\n            "options": {\n                "fetchSize": 100000,\n                "compress": True,\n            },\n        },\n        {\n            "spec_id": "my_identifier_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "max_my_identifier_bronze_date",\n            "input_id": "my_identifier_bronze",\n            "transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}],\n        },\n        {\n            "spec_id": "appended_my_identifier",\n            "input_id": "my_identifier_source",\n            "transformers": [\n                {\n                    "function": "incremental_filter",\n                    "args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"},\n                }\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_identifier_bronze",\n            "input_id": "appended_my_identifier",\n            "write_type": "append",\n            "data_format": "delta",\n            "partitions": ["RECORD"],\n            "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.filtered_full_load", "modulename": "lakehouse_engine_usage.data_loader.filtered_full_load", "kind": "module", "doc": "

    Filtered Full Load

    \n\n

    This scenario is very similar to the full load, but it filters the data coming from the source, instead of doing a complete full load.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "inferSchema": true\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "filtered_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "expression_filter",\n          "args": {\n            "exp": "date like '2016%'"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "filtered_sales",\n      "write_type": "overwrite",\n      "data_format": "parquet",\n      "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter/data"\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.filtered_full_load_with_selective_replace", "modulename": "lakehouse_engine_usage.data_loader.filtered_full_load_with_selective_replace", "kind": "module", "doc": "

    Filtered Full Load with Selective Replace

    \n\n

    This scenario is very similar to the Filtered Full Load, but we only replace a subset of the partitions, leaving the other ones untouched, so we don't replace the entire table. This capability is very useful for backfilling scenarios.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "inferSchema": true\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter_partition_overwrite/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "filtered_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "expression_filter",\n          "args": {\n            "exp": "date like '2016%'"\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "filtered_sales",\n      "write_type": "overwrite",\n      "data_format": "delta",\n      "partitions": [\n        "date",\n        "customer"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter_partition_overwrite/data",\n      "options": {\n        "replaceWhere": "date like '2016%'"\n      }\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.flatten_schema_and_explode_columns", "modulename": "lakehouse_engine_usage.data_loader.flatten_schema_and_explode_columns", "kind": "module", "doc": "

    Flatten Schema and Explode Columns

    \n\n

    Related to the schema, we can perform two kinds of operations:

    \n\n
    1. flatten_schema - flatten nested struct columns into additional top-level columns.
    2. explode_columns - explode array and/or map columns into one row per element.
    \n\n

    The flatten_schema scenario below transforms one or more columns, splitting the nested content into additional columns, as desired. We define the number of nested levels we want to flatten in the schema. In this case, we are just setting a max_level of 2.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "json",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/source_schema.json",\n      "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "rename",\n          "args": {\n            "cols": {\n              "date": "date2",\n              "customer": "customer2"\n            }\n          }\n        },\n        {\n          "function": "with_expressions",\n          "args": {\n            "cols_and_exprs": {\n              "constant": "'just a constant'",\n              "length_customer2": "length(customer2)"\n            }\n          }\n        },\n        {\n          "function": "from_json",\n          "args": {\n            "input_col": "sample",\n            "schema": {\n              "type": "struct",\n              "fields": [\n                {\n                  "name": "field1",\n                  "type": "string",\n                  "nullable": true,\n                  "metadata": {}\n                },\n                {\n                  "name": "field2",\n                  "type": "string",\n                  "nullable": true,\n                  "metadata": {}\n                },\n                {\n                  "name": "field3",\n                  "type": "double",\n                  "nullable": true,\n                  "metadata": {}\n                },\n                {\n                  "name": "field4",\n                  "type": {\n                    "type": "struct",\n                    "fields": [\n                      {\n                        "name": "field1",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      },\n                      {\n                        "name": "field2",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      }\n                    ]\n                  },\n                  "nullable": true,\n                  "metadata": {}\n                }\n              ]\n            }\n          }\n        },\n        {\n          "function": "to_json",\n          "args": {\n            "in_cols": [\n              "item",\n              "amount"\n            ],\n            "out_col": "item_amount_json"\n          }\n        },\n        {\n          "function": "flatten_schema",\n          "args": {\n            "max_level": 2\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/batch/data"\n    }\n  ]\n}\n
    \n
    \n\n

    The explode_arrays scenario transforms the array columns into one or more rows, depending on the number of elements, i.e. it replicates the row for each array value. In this case we are exploding all array columns by setting explode_arrays to true.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "json",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/source_schema.json",\n      "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "rename",\n          "args": {\n            "cols": {\n              "date": "date2",\n              "customer": "customer2"\n            }\n          }\n        },\n        {\n          "function": "with_expressions",\n          "args": {\n            "cols_and_exprs": {\n              "constant": "'just a constant'",\n              "length_customer2": "length(customer2)"\n            }\n          }\n        },\n        {\n          "function": "to_json",\n          "args": {\n            "in_cols": [\n              "item",\n              "amount"\n            ],\n            "out_col": "item_amount_json"\n          }\n        },\n        {\n          "function": "explode_columns",\n          "args": {\n            "explode_arrays": true\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/batch/data"\n    }\n  ]\n}\n
    \n
    \n\n

    The flatten_and_explode_arrays_and_maps scenario uses flatten_schema and explode_columns to produce the desired output. In this case, the desired output is to flatten the whole schema and explode the maps and arrays, even when an array is nested inside a struct. Steps:

    \n\n
    1. In this case, we have an array column inside a struct column, so first we need to use the `flatten_schema` transformer to extract the columns inside that struct;\n2. Then, we are able to explode all the array columns desired and map columns, using `explode_columns` transformer.\n3. To be able to have the map column in 2 columns, we use again the `flatten_schema` transformer.\n
    \n\n

    As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "json",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json",\n      "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "rename",\n          "args": {\n            "cols": {\n              "date": "date2",\n              "customer": "customer2"\n            }\n          }\n        },\n        {\n          "function": "with_expressions",\n          "args": {\n            "cols_and_exprs": {\n              "constant": "'just a constant'",\n              "length_customer2": "length(customer2)"\n            }\n          }\n        },\n        {\n          "function": "from_json",\n          "args": {\n            "input_col": "agg_fields",\n            "schema": {\n              "type": "struct",\n              "fields": [\n                {\n                  "name": "field1",\n                  "nullable": true,\n                  "metadata": {},\n                  "type": {\n                    "containsNull": true,\n                    "elementType": "string",\n                    "type": "array"\n                  }\n                },\n                {\n                  "name": "field2",\n                  "type": {\n                    "type": "struct",\n                    "fields": [\n                      {\n                        "name": "field1",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      },\n                      {\n                        "name": "field2",\n                        "type": "string",\n                        "nullable": true,\n                        "metadata": {}\n                      }\n                    ]\n                  },\n                  "nullable": true,\n                  "metadata": {}\n                }\n              ]\n            }\n          }\n        },\n        {\n          "function": "to_json",\n          "args": {\n            "in_cols": [\n              "item",\n              "amount"\n            ],\n            "out_col": "item_amount_json"\n          }\n        },\n        {\n          "function": "flatten_schema",\n          "args": {\n            "max_level": 2\n          }\n        },\n        {\n          "function": "explode_columns",\n          "args": {\n            "explode_arrays": true,\n            "map_cols_to_explode": [\n              "sample"\n            ]\n          }\n        },\n        {\n          "function": "flatten_schema",\n          "args": {\n            "max_level": 2\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch/data"\n    }\n  ]\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.full_load", "modulename": "lakehouse_engine_usage.data_loader.full_load", "kind": "module", "doc": "

    Full Load

    \n\n

    This scenario reads CSV data from a path and writes in full to another path with delta lake files.

    \n\n
    Relevant notes
    \n\n\n\n

    As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "batch",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "inferSchema": true\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/full_load/full_overwrite/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "repartitioned_sales",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "repartition",\n          "args": {\n            "num_partitions": 1,\n            "cols": ["date", "customer"]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "overwrite",\n      "data_format": "delta",\n      "partitions": [\n        "date",\n        "customer"\n      ],\n      "location": "file:///app/tests/lakehouse/out/feature/full_load/full_overwrite/data"\n    }\n  ]\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.read_from_dataframe", "modulename": "lakehouse_engine_usage.data_loader.read_from_dataframe", "kind": "module", "doc": "

    Read from Dataframe

    \n\n
    \n\n
    Don't use this feature if the Lakehouse Engine already has a supported data format for your use case, as in that case it is preferred to use the dedicated data formats which are more extensively tested and predictable. Check the supported data formats here.
    \n\n
    \n\n

    Reading from a Spark DataFrame is very simple using our framework. You just need to define the input_specs as follows:

    \n\n
    \n
    {\n    "input_spec": {\n        "spec_id": "my_df",\n        "read_type": "batch",\n        "data_format": "dataframe",\n        "df_name": df,\n    }\n}\n
    \n
    \n\n
    \n\n
    Why is it relevant?
    \n\n

    With this capability of reading a dataframe, you can deal with sources that do not yet officially have a reader (e.g., REST APIs, XML files, etc.).

    \n\n
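    A minimal, hypothetical usage sketch (the DataFrame contents and the output location are illustrative, not from the official examples):

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nfrom pyspark.sql import SparkSession\n\n# Any DataFrame you built yourself, e.g. parsed from a REST API response.\nspark = SparkSession.builder.getOrCreate()\ndf = spark.createDataFrame([(1, "article_a"), (2, "article_b")], ["id", "article"])\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "my_df",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": df,\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "my_df_bronze",\n            "input_id": "my_df",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_path/dummy_table",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n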
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_append_load_with_malformed", "modulename": "lakehouse_engine_usage.data_loader.streaming_append_load_with_malformed", "kind": "module", "doc": "

    Streaming Append Load with DROPMALFORMED

    \n\n

    This scenario illustrates an append load done via streaming instead of batch, providing an efficient way of picking up new files from an S3 folder, instead of relying on the incremental filtering from the source needed in a batch-based process (see append loads in batch from a JDBC source to understand the differences between streaming and batch append loads). However, not all sources (e.g., JDBC) allow streaming.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "mode": "DROPMALFORMED"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming_dropmalformed/data",\n      "schema": {\n        "type": "struct",\n        "fields": [\n          {\n            "name": "salesorder",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "item",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "date",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "customer",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "article",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "amount",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          }\n        ]\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "db_table": "test_db.streaming_dropmalformed_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming_dropmalformed/checkpoint"\n      },\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming_dropmalformed/data"\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_append_load_with_terminator", "modulename": "lakehouse_engine_usage.data_loader.streaming_append_load_with_terminator", "kind": "module", "doc": "

    Streaming Append Load with Optimize Dataset Terminator

    \n\n

    This scenario includes a terminator which optimizes a dataset (table), being able to vacuum the table, optimize it (with or without z-order), compute table statistics and more. You can find more details on the Terminator here.

    \n\n

    As for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|",\n        "mode": "DROPMALFORMED"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming_with_terminators/data",\n      "schema": {\n        "type": "struct",\n        "fields": [\n          {\n            "name": "salesorder",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "item",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "date",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "customer",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "article",\n            "type": "string",\n            "nullable": true,\n            "metadata": {}\n          },\n          {\n            "name": "amount",\n            "type": "integer",\n            "nullable": true,\n            "metadata": {}\n          }\n        ]\n      }\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "sales_source",\n      "write_type": "append",\n      "db_table": "test_db.streaming_with_terminators_table",\n      "data_format": "delta",\n      "partitions": [\n        "date"\n      ],\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming_with_terminators/checkpoint"\n      },\n      "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming_with_terminators/data"\n    }\n  ],\n  "terminate_specs": [\n    {\n      "function": "optimize_dataset",\n      "args": {\n        "db_table": "test_db.streaming_with_terminators_table",\n        "debug": true\n      }\n    }\n  ]\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_delta_load_with_group_and_rank_condensation", "modulename": "lakehouse_engine_usage.data_loader.streaming_delta_load_with_group_and_rank_condensation", "kind": "module", "doc": "

    Streaming Delta Load with Group and Rank Condensation

    \n\n

    This scenario is useful when we want to do delta loads based on changelogs that first need to be condensed using a group by followed by a rank only, instead of the record mode logic in the record-mode-based change data capture.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/source_schema.json",\n      "with_filepath": true,\n      "options": {\n        "mode": "FAILFAST",\n        "header": true,\n        "delimiter": "|"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "sales_bronze_with_extraction_date",\n      "input_id": "sales_bronze",\n      "transformers": [\n        {\n          "function": "with_regex_value",\n          "args": {\n            "input_col": "lhe_extraction_filepath",\n            "output_col": "extraction_date",\n            "drop_input_col": true,\n            "regex": ".*WE_SO_SCL_(\\\\d+).csv"\n          }\n        },\n        {\n          "function": "with_auto_increment_id"\n        },\n        {\n          "function": "group_and_rank",\n          "args": {\n            "group_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key": [\n              "extraction_date",\n              "changed_on",\n              "lhe_row_id"\n            ]\n          }\n        },\n        {\n          "function": "repartition",\n          "args": {\n            "num_partitions": 1\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_silver",\n      "input_id": "sales_bronze_with_extraction_date",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data",\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/checkpoint"\n      },\n      "with_batch_id": true,\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item",\n        "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on",\n        "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'"\n      }\n    }\n  ]\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_loader.streaming_delta_with_late_arriving_and_out_of_order_events", "modulename": "lakehouse_engine_usage.data_loader.streaming_delta_with_late_arriving_and_out_of_order_events", "kind": "module", "doc": "

    Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking)

    \n\n

    How to Deal with Late Arriving Data without using Watermark

    \n\n

    This scenario covers a delta load in streaming mode that is able to deal with late arriving and out of order events.\nAs for other cases, the acon configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of ACON configuration:

    \n\n
    \n
    {\n  "input_specs": [\n    {\n      "spec_id": "sales_source",\n      "read_type": "streaming",\n      "data_format": "csv",\n      "options": {\n        "header": true,\n        "delimiter": "|"\n      },\n      "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data"\n    }\n  ],\n  "transform_specs": [\n    {\n      "spec_id": "transformed_sales_source",\n      "input_id": "sales_source",\n      "transformers": [\n        {\n          "function": "condense_record_mode_cdc",\n          "args": {\n            "business_key": [\n              "salesorder",\n              "item"\n            ],\n            "ranking_key_desc": [\n              "extraction_timestamp",\n              "actrequest_timestamp",\n              "datapakid",\n              "partno",\n              "record"\n            ],\n            "record_mode_col": "recordmode",\n            "valid_record_modes": [\n              "",\n              "N",\n              "R",\n              "D",\n              "X"\n            ]\n          }\n        }\n      ]\n    }\n  ],\n  "output_specs": [\n    {\n      "spec_id": "sales_bronze",\n      "input_id": "transformed_sales_source",\n      "write_type": "merge",\n      "data_format": "delta",\n      "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data",\n      "options": {\n        "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/checkpoint"\n      },\n      "merge_opts": {\n        "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n        "update_predicate": "new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)",\n        "delete_predicate": "new.recordmode in ('R','D','X')",\n        "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"\n      }\n    }\n  ],\n  "exec_env": {\n    "spark.sql.streaming.schemaInference": true\n  }\n}\n
    \n
    \n\n
    Relevant notes:
    \n\n\n\n
    \n\n
    Disclaimer! The scenario illustrated on this page is purely fictional and was designed specifically for the Lakehouse Engine local tests. Your data source changelogs may be different, and the scenario and predicates discussed here may not apply to you. Consequently, the data product team should reason about the merge predicate and the insert, update and delete predicates that best reflect how they want to handle the delta loads for their data.
    \n\n
    \n\n\n\n
    \n\n
    Documentation
    \n\n

    Feature Deep Dive: Watermarking in Apache Spark Structured Streaming - The Databricks Blog\nStructured Streaming Programming Guide - Spark 3.4.0 Documentation

    \n\n
    \n\n

    How to Deal with Late Arriving Data using Watermark

    \n\n

    When building real-time pipelines, one of the realities that teams have to work with is that distributed data ingestion is inherently unordered. Additionally, in the context of stateful streaming operations, teams need to be able to properly track event time progress in the stream of data they are ingesting for the proper calculation of time-window aggregations and other stateful operations. While working with real-time streaming data there will be delays between event time and processing time due to how data is ingested and whether the overall application experiences issues like downtime. Due to these potential variable delays, the engine that you use to process this data needs to have some mechanism to decide when to close the aggregate windows and produce the aggregate result.

    \n\n

    Imagine a scenario where we will need to perform stateful aggregations on the streaming data to understand and identify problems in the machines. This is where we need to leverage Structured Streaming and Watermarking to produce the necessary stateful aggregations.

    \n\n
    Approach 1 - Use a pre-defined fixed window (Bad)
    \n\n

    \n\n

    Credits: Image source

    \n\n

    To explain this visually let\u2019s take a scenario where we are receiving data at various times from around 10:50 AM \u2192 11:20 AM. We are creating 10-minute tumbling windows that calculate the average of the temperature and pressure readings that came in during the windowed period.

    \n\n

    In this first picture, we have the tumbling windows trigger at 11:00 AM, 11:10 AM and 11:20 AM leading to the result tables shown at the respective times. When the second batch of data comes around 11:10 AM with data that has an event time of 10:53 AM this gets incorporated into the temperature and pressure averages calculated for the 11:00 AM \u2192 11:10 AM window that closes at 11:10 AM, which does not give the correct result.

    \n\n
    Approach 2 - Watermark
    \n\n

    We can define a watermark that will allow Spark to understand when to close the aggregate window and produce the correct aggregate result. In Structured Streaming applications, we can ensure that all relevant data for the aggregations we want to calculate is collected by using a feature called watermarking. In the most basic sense, by defining a watermark Spark Structured Streaming then knows when it has ingested all data up to some time, T, (based on a set lateness expectation) so that it can close and produce windowed aggregates up to timestamp T.

    \n\n

    \n\n

    Credits: Image source

    \n\n

    Unlike the first scenario where Spark will emit the windowed aggregation for the previous ten minutes every ten minutes (i.e. emit the 11:00 AM \u219211:10 AM window at 11:10 AM), Spark now waits to close and output the windowed aggregation once the max event time seen minus the specified watermark is greater than the upper bound of the window.

    \n\n

    In other words, Spark needed to wait until it saw data points where the latest event time seen minus 10 minutes was greater than 11:00 AM to emit the 10:50 AM \u2192 11:00 AM aggregate window. At 11:00 AM, it does not see this, so it only initialises the aggregate calculation in Spark\u2019s internal state store. At 11:10 AM, this condition is still not met, but we have a new data point for 10:53 AM so the internal state gets updated, just not emitted. Then finally by 11:20 AM Spark has seen a data point with an event time of 11:15 AM and since 11:15 AM minus 10 minutes is 11:05 AM which is later than 11:00 AM the 10:50 AM \u2192 11:00 AM window can be emitted to the result table.

    \n\n

    This produces the correct result by properly incorporating the data based on the expected lateness defined by the watermark. Once the results are emitted the corresponding state is removed from the state store.
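    To make the walk-through above concrete, here is a minimal PySpark sketch (not taken from the engine tests; the source path and the event_time, temperature and pressure columns are illustrative assumptions) of a 10-minute tumbling window combined with a 10-minute watermark:

    \n\n
    \n
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical stream of machine readings with event_time, temperature and pressure columns.
    readings = (
        spark.readStream.format("delta")
        .load("s3://my_data_product_bucket/bronze/machine_readings")
    )

    windowed_averages = (
        readings
        # Tolerate events arriving up to 10 minutes late before a window is finalised.
        .withWatermark("event_time", "10 minutes")
        # 10-minute tumbling windows, as in the 10:50 AM -> 11:20 AM walk-through above.
        .groupBy(F.window("event_time", "10 minutes"))
        .agg(
            F.avg("temperature").alias("avg_temperature"),
            F.avg("pressure").alias("avg_pressure"),
        )
    )
    \n
    \n\n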

    \n\n
    Watermarking and Different Output Modes
    \n\n

    It is important to understand how state, late-arriving records, and the different output modes could lead to different behaviours of your application running on Spark. The main takeaway here is that in both append and update modes, once the watermark indicates that all data is received for an aggregate time window, the engine can trim the window state. In append mode the aggregate is produced only at the closing of the time window plus the watermark delay while in update mode it is produced on every update to the window.
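    As a minimal illustration of the difference between the two modes (using Spark's built-in rate source purely for demonstration; nothing here is specific to the Lakehouse Engine), the same windowed aggregation can be started in both output modes:

    \n\n
    \n
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()

    # The rate source only generates (timestamp, value) rows; it is enough to compare the modes.
    events = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

    windowed_counts = (
        events.withWatermark("timestamp", "10 minutes")
        .groupBy(F.window("timestamp", "10 minutes"))
        .count()
    )

    # Append mode: each window is emitted exactly once, after the watermark passes its end.
    windowed_counts.writeStream.outputMode("append").format("console").start()

    # Update mode: the current partial count for a window is re-emitted whenever it changes.
    windowed_counts.writeStream.outputMode("update").format("console").start()
    \n
    \n\n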

    \n\n

    Lastly, by increasing your watermark delay window you will cause the pipeline to wait longer for data and potentially drop less data \u2013 higher precision, but also higher latency to produce the aggregates. On the flip side, smaller watermark delay leads to lower precision but also lower latency to produce the aggregates.

    \n\n

    Watermarks can only be used when you are running your streaming application in append or update output modes. There is a third output mode, complete mode, in which the entire result table is written to storage. This mode cannot be used because it requires all aggregate data to be preserved, and hence cannot use watermarking to drop intermediate state.

    \n\n
    Joins With Watermark
    \n\n

    There are three types of stream-stream joins that can be implemented in Structured Streaming: inner, outer, and semi joins. The main problem with doing joins in streaming applications is that you may have an incomplete picture of one side of the join. Giving Spark an understanding of when there are no future matches to expect is similar to the earlier problem with aggregations where Spark needed to understand when there were no new rows to incorporate into the calculation for the aggregation before emitting it.

    \n\n

    To allow Spark to handle this, we can leverage a combination of watermarks and event-time constraints within the join condition of the stream-stream join. This combination allows Spark to filter out late records and trim the state for the join operation through a time range condition on the join.
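    A minimal sketch of such a join, assuming two hypothetical streams (machine readings and maintenance events) with their own event-time columns; the paths, column names, watermarks and time bounds below are illustrative only:

    \n\n
    \n
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import expr

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical streams sharing a machine_id key, each with its own event-time column.
    readings = (
        spark.readStream.format("delta")
        .load("s3://my_data_product_bucket/bronze/machine_readings")
        .withWatermark("reading_time", "1 hour")
        .alias("r")
    )
    maintenance = (
        spark.readStream.format("delta")
        .load("s3://my_data_product_bucket/bronze/maintenance_events")
        .withWatermark("maintenance_time", "2 hours")
        .alias("m")
    )

    # The event-time range condition tells Spark when a buffered reading can no longer
    # match any future maintenance event, so its state can be trimmed.
    joined = readings.join(
        maintenance,
        expr(
            """
            r.machine_id = m.machine_id AND
            m.maintenance_time >= r.reading_time AND
            m.maintenance_time <= r.reading_time + interval 1 hour
            """
        ),
    )
    \n
    \n\n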

    \n\n

    Spark has a policy for handling multiple watermark definitions. Spark maintains one global watermark that is based on the slowest stream to ensure the highest amount of safety when it comes to not missing data.

    \n\n

    We can change this behaviour by setting spark.sql.streaming.multipleWatermarkPolicy to max; however, this means that late data from the slower stream may be dropped.
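    Like the other Spark properties shown on this page, this setting can presumably be passed through the exec_env section of the ACON (or set directly with spark.conf.set):

    \n\n
    \n
    "exec_env": {
        "spark.sql.streaming.multipleWatermarkPolicy": "max"
    }
    \n
    \n\n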

    \n\n
    State Store Performance Considerations
    \n\n

    As of Spark 3.2, Spark offers a RocksDB-based state store provider.

    \n\n

    If you have stateful operations in your streaming query (for example, streaming aggregation, streaming dropDuplicates, stream-stream joins, mapGroupsWithState, or flatMapGroupsWithState) and you want to maintain millions of keys in the state, then you may face issues related to large JVM garbage collection (GC) pauses causing high variations in the micro-batch processing times. This occurs because, in the implementation of HDFSBackedStateStore, the state data is maintained in the JVM memory of the executors, and a large number of state objects puts memory pressure on the JVM, causing high GC pauses.

    \n\n

    In such cases, you can choose to use a more optimized state management solution based on RocksDB. Rather than keeping the state in the JVM memory, this solution uses RocksDB to efficiently manage the state in the native memory and the local disk. Furthermore, any changes to this state are automatically saved by Structured Streaming to the checkpoint location you have provided, thus providing full fault-tolerance guarantees (the same as default state management).

    \n\n

    To enable the new built-in state store implementation, set spark.sql.streaming.stateStore.providerClass to org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider.

    \n\n

    For more details, please visit the Spark documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation

    \n\n

    You can enable this in your ACONs by specifying it as part of the exec_env properties, as shown below:

    \n\n
    \n
    "exec_env": {\n    "spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider"\n}\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.write_and_read_dataframe", "modulename": "lakehouse_engine_usage.data_loader.write_and_read_dataframe", "kind": "module", "doc": "

    Write and Read Dataframe

    \n\n

    The DataFrame writer can give us some advantages by returning a dictionary containing the spec_id and the computed dataframe.\nIn these examples we will cover the following scenarios of using the output dataframe format:

    \n\n
      \n
    1. Write to dataframe: Consuming the output spec as DataFrame;
    2. Write all dataframes: Consuming all DataFrames generated per specs;
    3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data.
    \n\n

    Main advantages of using this output writer:

    \n\n\n\n

    If you need to, you can add as many dataframes as you want to the output specs,\nreferencing the spec_id you want to add.

    \n\n
    \n\n

    This is not intended to replace the other capabilities offered by the\nlakehouse-engine. If another feature can cover your use case,\nyou should use it instead of the DataFrame writer, as those features\nare much more extensively tested on different types of operations.

    \n\n

    Additionally, please always consider whether the problem that you are trying to solve, for which no lakehouse-engine feature is available, could be a common problem and thus deserve a common solution and feature.

    \n\n

    Moreover, the DataFrame writer is not supported for the streaming trigger\ntypes processing time and continuous.

    \n\n
    \n\n

    1. Write to dataframe: Consuming the output spec as DataFrame

    \n\n

    Silver Dummy Sales Write to DataFrame

    \n\n

    In this example we will cover the Dummy Sales write to a result containing the output DataFrame.

    \n\n\n\n
    \n\n
    If you try to retrieve the same data more than once using a checkpoint, an empty dataframe with an empty schema will be returned, as there is no new data to read.
    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_sales_bronze",\n            "read_type": "streaming",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_sales_transform",\n            "input_id": "dummy_sales_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_sales_silver",\n            "input_id": "dummy_sales_transform",\n            "data_format": "dataframe",\n            "options": {\n                "checkpointLocation": "s3://my_data_product_bucket/checkpoints/bronze/dummy_sales",\n            },\n        }\n    ],\n}\n
    \n
    \n\n

    Run the Load and Return the Dictionary with the DataFrames by OutputSpec

    \n\n

    This exploratory test will return a dictionary with the output spec id and the dataframe\nthat results after the transformations.

    \n\n
    \n
    output = load_data(acon=acon)\ndisplay(output.keys())\ndisplay(output.get("dummy_sales_silver"))\n
    \n
    \n\n

    2. Write all dataframes: Consuming all DataFrames generated per specs

    \n\n

    Silver Dummy Sales Write to DataFrame

    \n\n

    In this example we will cover the Dummy Sales write to a result containing the specs and related DataFrame.

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_sales_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_sales_transform",\n            "input_id": "dummy_sales_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "sales_bronze",\n            "input_id": "dummy_sales_bronze",\n            "data_format": "dataframe",\n        },\n        {\n            "spec_id": "sales_silver",\n            "input_id": "dummy_sales_transform",\n            "data_format": "dataframe",\n        },\n    ],\n}\n
    \n
    \n\n\n\n

    This exploratory test will return a dictionary with all specs and the related dataframes.\nYou can access the DataFrame you need with output.get(<spec_id>) for future developments and tests.

    \n\n
    \n
    output = load_data(acon=acon)\ndisplay(output.keys())\ndisplay(output.get("sales_bronze"))\ndisplay(output.get("sales_silver"))\n
    \n
    \n\n

    3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data

    \n\n

    Silver Load Dummy Deliveries

    \n\n

    In this example we will cover reading the Dummy Deliveries table and incrementally loading it to silver, composing the silver data to write by using the DataFrame output spec:

    \n\n\n\n
    \n\n
    This example is not a recommendation on how to deal with incremental loads; the ACON was split into three for demo purposes.
    \n\n
    \n\n

    Consume bronze data, generate the latest data and return a dictionary with bronze and transformed dataframes:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "read_type": "batch",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        },\n        {\n            "spec_id": "dummy_deliveries_silver_source",\n            "read_type": "batch",\n            "data_format": "delta",\n            "db_table": "my_database.dummy_deliveries",\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_deliveries_table_max_value",\n            "input_id": "dummy_deliveries_silver_source",\n            "transformers": [\n                {\n                    "function": "get_max_value",\n                    "args": {"input_col": "delivery_date", "output_col": "latest"},\n                },\n                {\n                    "function": "with_expressions",\n                    "args": {\n                        "cols_and_exprs": {"latest": "CASE WHEN latest IS NULL THEN 0 ELSE latest END"},\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "deliveries_bronze",\n            "input_id": "dummy_deliveries_bronze",\n            "data_format": "dataframe",\n        },\n        {\n            "spec_id": "dummy_deliveries_transformed",\n            "input_id": "dummy_deliveries_table_max_value",\n            "data_format": "dataframe",\n        },\n    ],\n}\n\ndummy_deliveries_transformed = load_data(acon=acon)\n\ndummy_deliveries_transformed_df = dummy_deliveries_transformed.get("dummy_deliveries_transformed")\ndummy_deliveries_bronze_df = dummy_deliveries_transformed.get("deliveries_bronze")\n
    \n
    \n\n

    Consume the previous dataframes generated by the first ACON (bronze and latest bronze data) to generate the silver data. In this ACON we use only one output because we only need the dataframe from the output for the next step.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"delivery_note_header": "delivery_note", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": dummy_deliveries_bronze_df,\n        },\n        {\n            "spec_id": "dummy_deliveries_table_max_value",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": dummy_deliveries_transformed_df,\n        },\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_deliveries_transform",\n            "input_id": "dummy_deliveries_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n                {\n                    "function": "incremental_filter",\n                    "args": {\n                        "input_col": "delivery_date",\n                        "increment_df": "dummy_deliveries_table_max_value",\n                        "increment_col": "latest",\n                        "greater_or_equal": False,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_silver",\n            "input_id": "dummy_deliveries_transform",\n            "data_format": "dataframe",\n        }\n    ],\n}\n\ndummy_deliveries_silver = load_data(acon=acon)\ndummy_deliveries_silver_df = dummy_deliveries_silver.get("dummy_deliveries_silver")\n
    \n
    \n\n

    Write the silver data generated by the previous ACON into the target:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nwrite_silver_acon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_silver",\n            "read_type": "batch",\n            "data_format": "dataframe",\n            "df_name": dummy_deliveries_silver_df,\n        },\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dummy_deliveries_quality",\n            "input_id": "dummy_deliveries_silver",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "expectations_store_prefix": "dq/expectations/",\n            "validations_store_prefix": "dq/validations/",\n            "data_docs_prefix": "dq/data_docs/site/",\n            "checkpoint_store_prefix": "dq/checkpoints/",\n            "result_sink_db_table": "my_database.dummy_deliveries_dq",\n            "result_sink_location": "my_data_product_bucket/dq/dummy_deliveries",\n            "fail_on_error": False,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "dq_functions": [\n                {\n                    "function": "expect_column_values_to_not_be_null",\n                    "args": {"column": "delivery_note"},\n                },\n                {\n                    "function": "expect_table_row_count_to_be_between",\n                    "args": {"min_value": 19},\n                },\n                {\n                    "function": "expect_column_max_to_be_between",\n                    "args": {"column": "delivery_item", "min_value": 2},\n                },\n            ],\n        },\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_silver",\n            "input_id": "dummy_deliveries_quality",\n            "write_type": "append",\n            "location": "s3://my_data_product_bucket/silver/dummy_deliveries_df_writer",\n            "data_format": "delta",\n        }\n    ],\n    "exec_env": {\n        "spark.databricks.delta.schema.autoMerge.enabled": True,\n        "spark.databricks.delta.optimizeWrite.enabled": True,\n        "spark.databricks.delta.autoCompact.enabled": True,\n    },\n}\n\nload_data(acon=write_silver_acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_loader.write_to_console", "modulename": "lakehouse_engine_usage.data_loader.write_to_console", "kind": "module", "doc": "

    Write to Console

    \n\n

    The console writer is a useful feature to debug/validate what has been done in the lakehouse engine. Before moving forward and storing data somewhere, it is possible to show/print the final dataframe to the console, which means you can transform the data as many times as you want and display the final result to validate that it is as expected.

    \n\n

    Silver Dummy Sales Write to Console Example

    \n\n

    In this template we will cover the Dummy Sales write to the console. An ACON is used to read from bronze, apply silver transformations and write to the console through the following steps:

    \n\n
      \n
    1. Definition of how to read data (input data location, read type and data format);
    2. Transformation of data (rename relevant columns);
    3. Definition of how to print to console (limit, truncate, vertical options);
    \n\n

    For this, the ACON specs are:

    \n\n\n\n
    \n\n
    The console writer is a wrapper for the Spark show() function; if you want to know more about the function itself or the available options, please check the Spark documentation here.
    \n\n
    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\ncols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_sales_bronze",\n            "read_type": "streaming",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_sales",\n        }\n    ],\n    "transform_specs": [\n        {\n            "spec_id": "dummy_sales_transform",\n            "input_id": "dummy_sales_bronze",\n            "transformers": [\n                {\n                    "function": "rename",\n                    "args": {\n                        "cols": cols_to_rename,\n                    },\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_sales_silver",\n            "input_id": "dummy_sales_transform",\n            "data_format": "console",\n            "options": {"limit": 8, "truncate": False, "vertical": False},\n        }\n    ],\n}\n
    \n
    \n\n
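    For reference only, the console output spec above roughly maps onto the Spark show() call below (assuming the "limit" option corresponds to the number of rows passed to show()):

    \n\n
    \n
    # final_df stands for the final transformed dataframe that the console writer receives.
    # Approximately what the writer does with the options defined in the ACON above:
    final_df.show(n=8, truncate=False, vertical=False)
    \n
    \n\n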

    And then, run the load and exit the notebook: this exploratory test will write to the console, which means the final\ndataframe will be displayed.

    \n\n
    \n
    load_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_quality", "modulename": "lakehouse_engine_usage.data_quality", "kind": "module", "doc": "

    Data Quality

    \n\n

    The Data Quality framework is based on Great Expectations (GX) and other custom-made \ndevelopments, providing a very light abstraction on top of the GX open source framework and the Spark framework.

    \n\n

    How to use Data Quality?

    \n\n

    Data Loader

    \n\n

    You can define data quality rules inside the DataLoader algorithm that you use to load data.

    \n\n
    \n\n

    The DataLoader algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the \ncapability to store DQ results having history over all the DQ executions, which can be used for debugging, \nto create DQ dashboards on top of the data, and much more.

    \n\n
    \n\n

    Examples:\nIn these examples, dummy sales local data is used to cover a few example usages of the DQ Framework\n(based on Great Expectations).\nThe main difference between the sample ACONs is in the usage of dq_specs.

    \n\n\n\n
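    As a minimal sketch of how a dq_specs block fits into a DataLoader ACON (the spec ids, paths, table names and expectations below are illustrative assumptions, not one of the sample ACONs):

    \n\n
    \n
    from lakehouse_engine.engine import load_data

    acon = {
        "input_specs": [
            {
                "spec_id": "dummy_sales_bronze",
                "read_type": "batch",
                "data_format": "delta",
                "location": "s3://my_data_product_bucket/bronze/dummy_sales",
            }
        ],
        "dq_specs": [
            {
                "spec_id": "dummy_sales_quality",
                "input_id": "dummy_sales_bronze",
                "dq_type": "validator",
                "bucket": "my_data_product_bucket",
                "result_sink_db_table": "my_database.dummy_sales_dq",
                "result_sink_location": "my_data_product_bucket/dq/dummy_sales",
                "fail_on_error": False,
                "tbl_to_derive_pk": "my_database.dummy_sales",
                "dq_functions": [
                    {"function": "expect_column_to_exist", "args": {"column": "article"}},
                    {
                        "function": "expect_column_values_to_not_be_null",
                        "args": {"column": "salesorder"},
                    },
                ],
            }
        ],
        "output_specs": [
            {
                "spec_id": "dummy_sales_silver",
                "input_id": "dummy_sales_quality",
                "write_type": "append",
                "data_format": "delta",
                "location": "s3://my_data_product_bucket/silver/dummy_sales",
            }
        ],
    }

    load_data(acon=acon)
    \n
    \n\n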

    Data Quality Validator

    \n\n

    The DQValidator algorithm focuses on validating data (e.g., spark DataFrames, Files or Tables).\nIn contrast to the dq_specs inside the DataLoader algorithm, the DQValidator focuses on validating data at rest \n(post-mortem) instead of validating data in-transit (before it is loaded to the destination).

    \n\n
    \n\n

    The DQValidator algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the\ncapability to store DQ results having history over all the DQ executions, which can be used for debugging,\nto create DQ dashboards on top of the data, and much more.

    \n\n
    \n\n

    Here you can find more information regarding DQValidator and examples.

    \n\n

    Reconciliator

    \n\n

    Similarly to the Data Quality Validator algorithm, the Reconciliator algorithm focuses on \nvalidating data at rest (post-mortem). In contrast to the DQValidator algorithm, the Reconciliator always compares a \ntruth dataset (e.g., spark DataFrames, Files or Tables) with the current dataset (e.g., spark DataFrames, Files or \nTables), instead of executing DQ rules defined by the teams. \nHere you can find more information regarding reconciliator and examples.

    \n\n
    \n\n
    Reconciliator does not use Great Expectations, therefore Data Docs, Result Sink and other native methods are not available.
    \n\n
    \n\n

    Custom Expectations

    \n\n

    If your data has a data quality check that cannot be done with the expectations provided by Great Expectations you \ncan create a custom expectation to make this verification.

    \n\n
    \n\n

    Before creating a custom expectation, check if there is an expectation already created to address your needs,\nboth in Great Expectations and the Lakehouse Engine.\nAny Custom Expectation that is too specific (using hardcoded table/column names) will be rejected.\nExpectations should be generic by definition.

    \n\n
    \n\n

    Here you can find more information regarding custom expectations and examples.

    \n\n

    Row Tagging

    \n\n

    The row tagging strategy allows users to tag the rows that failed, making it easier to identify the problems\nin the validations. Here you can find all the details and examples.

    \n\n

    How to check the results of the Data Quality Process?

    \n\n

    1. Table/location analysis

    \n\n

    The possibility to configure a Result Sink allows you to store the history of executions of the DQ process. \nYou can query the table or the location to search through data and analyse history.
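    A minimal sketch, assuming the result sink table used in the earlier DataLoader example (my_database.dummy_deliveries_dq); the exact columns available depend on your engine version:

    \n\n
    \n
    # Query the DQ result sink history directly in a notebook.
    dq_history = spark.table("my_database.dummy_deliveries_dq")
    dq_history.show(truncate=False)
    \n
    \n\n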

    \n\n

    2. Power BI Dashboard

    \n\n

    With the information expanded, interactive analysis can be built on top of the history of the DQ process.\nA dashboard can be created with the results that we have in dq_specs. To have this information available, you\nneed to use the arguments result_sink_db_table and/or result_sink_location.

    \n\n

    Through having a dashboard, the runs and expectations can be analysed, filtered by year, month, source and \nrun name, and you will have information about the number of runs, some statistics, status of expectations and more. \nAnalysis such as biggest failures per expectation type, biggest failures by columns, biggest failures per source, \nand others can be made, using the information in the result_sink_db_table/result_sink_location.

    \n\n
    \n\n

    The recommendation is to use the same result sink table/location for all your dq_specs and \nin the dashboard you will get a preview of the status of all of them.

    \n\n
    \n\n

    \n\n

    3. Data Docs Website

    \n\n

    An auto-generated site presenting all the relevant information can also be used. If you choose to define\nthe parameter data_docs_bucket, you will be able to store the GX documentation in the defined bucket,\nand therefore make your data docs available in the DQ Web App (GX UI), visible to everyone.\nThe data_docs_bucket property supersedes the bucket property only for data docs storage.

    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.custom_expectations", "modulename": "lakehouse_engine_usage.data_quality.custom_expectations", "kind": "module", "doc": "

    Custom Expectations

    \n\n

    Defining Custom Expectations

    \n\n

    Custom expectations are defined in python and need to follow a structure to correctly integrate with Great Expectations.

    \n\n

    Follow the documentation of GX on Creating Custom Expectations \nand find information about the existing types of expectations.

    \n\n

    Here is an example of a custom expectation.\nAs for other cases, the ACON configuration should be executed with load_data using:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\nacon = {...}\nload_data(acon=acon)\n
    \n
    \n\n

    Example of a custom expectation implementation:

    \n\n
    \n
    """Expectation to check if column 'a' is lower or equal than column 'b'."""\n\nfrom typing import Any, Dict, Optional\n\nfrom great_expectations.core import ExpectationConfiguration\nfrom great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine\nfrom great_expectations.expectations.expectation import ColumnPairMapExpectation\nfrom great_expectations.expectations.metrics.map_metric_provider import (\n    ColumnPairMapMetricProvider,\n    column_pair_condition_partial,\n)\n\nfrom lakehouse_engine.utils.expectations_utils import validate_result\n\n\nclass ColumnPairCustom(ColumnPairMapMetricProvider):\n    """Asserts that column 'A' is lower or equal than column 'B'.\n\n    Additionally, the 'margin' parameter can be used to add a margin to the\n    check between column 'A' and 'B': 'A' <= 'B' + 'margin'.\n    """\n\n    condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"\n    condition_domain_keys = (\n        "batch_id",\n        "table",\n        "column_A",\n        "column_B",\n        "ignore_row_if",\n    )\n    condition_value_keys = ("margin",)\n\n    @column_pair_condition_partial(engine=SparkDFExecutionEngine)\n    def _spark(\n        self: ColumnPairMapMetricProvider,\n        column_A: Any,\n        column_B: Any,\n        margin: Any,\n        **kwargs: dict,\n    ) -> Any:\n        """Implementation of the expectation's logic.\n\n        Args:\n            column_A: Value of the row of column_A.\n            column_B: Value of the row of column_B.\n            margin: margin value to be added to column_b.\n            kwargs: dict with additional parameters.\n\n        Returns:\n            If the condition is met.\n        """\n        if margin is None:\n            approx = 0\n        elif not isinstance(margin, (int, float, complex)):\n            raise TypeError(\n                f"margin must be one of int, float, complex."\n                f" Found: {margin} as {type(margin)}"\n            )\n        else:\n            approx = margin  # type: ignore\n\n        return column_A <= column_B + approx  # type: ignore\n\n\nclass ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):\n    """Expect values in column A to be lower or equal than column B.\n\n    Args:\n        column_A: The first column name.\n        column_B: The second column name.\n        margin: additional approximation to column B value.\n\n    Keyword Args:\n        - allow_cross_type_comparisons: If True, allow\n            comparisons between types (e.g. integer and string).\n            Otherwise, attempting such comparisons will raise an exception.\n        - ignore_row_if: "both_values_are_missing",\n            "either_value_is_missing", "neither" (default).\n        - result_format: Which output mode to use:\n            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.\n        - include_config: If True (default), then include the expectation config\n            as part of the result object.\n        - catch_exceptions: If True, then catch exceptions and\n            include them as part of the result object. 
Default: False.\n        - meta: A JSON-serializable dictionary (nesting allowed)\n            that will be included in the output without modification.\n\n    Returns:\n        An ExpectationSuiteValidationResult.\n    """\n\n    examples = [\n        {\n            "dataset_name": "Test Dataset",\n            "data": [\n                {\n                    "data": {\n                        "a": [11, 22, 50],\n                        "b": [10, 21, 100],\n                        "c": [9, 21, 30],\n                    },\n                    "schemas": {\n                        "spark": {\n                            "a": "IntegerType",\n                            "b": "IntegerType",\n                            "c": "IntegerType",\n                        }\n                    },\n                }\n            ],\n            "tests": [\n                {\n                    "title": "negative_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "c",\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["c"],\n                        },\n                    },\n                    "out": {\n                        "success": False,\n                        "unexpected_index_list": [\n                            {"c": 9, "a": 11},\n                            {"c": 21, "a": 22},\n                            {"c": 30, "a": 50},\n                        ],\n                    },\n                },\n                {\n                    "title": "positive_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "b",\n                        "margin": 1,\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["a"],\n                        },\n                    },\n                    "out": {\n                        "success": True,\n                        "unexpected_index_list": [],\n                    },\n                },\n            ],\n        },\n    ]\n\n    map_metric = "column_pair_values.a_smaller_or_equal_than_b"\n    success_keys = (\n        "column_A",\n        "column_B",\n        "ignore_row_if",\n        "margin",\n        "mostly",\n    )\n    default_kwarg_values = {\n        "mostly": 1.0,\n        "ignore_row_if": "neither",\n        "result_format": "BASIC",\n        "include_config": True,\n        "catch_exceptions": False,\n    }\n\n    def _validate(\n        self,\n        configuration: ExpectationConfiguration,\n        metrics: Dict,\n        runtime_configuration: Optional[dict] = None,\n        execution_engine: Optional[ExecutionEngine] = None,\n    ) -> dict:\n        """Custom implementation of the GE _validate method.\n\n        This method is used on the tests to validate both the result\n        of the tests themselves and if the unexpected index list\n        is correctly generated.\n        The GE test logic does not do this validation, and thus\n        we need to make it manually.\n\n        Args:\n            configuration: Configuration used in the test.\n            metrics: Test result metrics.\n            
runtime_configuration: Configuration used when running the expectation.\n            execution_engine: Execution Engine where the expectation was run.\n\n        Returns:\n            Dictionary with the result of the validation.\n        """\n        return validate_result(self, configuration, metrics)\n\n\n"""Mandatory block of code. If it is removed the expectation will not be available."""\nif __name__ == "__main__":\n    # test the custom expectation with the function `print_diagnostic_checklist()`\n    ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()\n
    \n
    \n\n

    Naming Conventions

    \n\n

    Your expectation's name should start with expect.

    \n\n

    The name of the file must be the name of the expectation written in snake case. Ex: expect_column_length_match_input_length

    \n\n

    The name of the class must be the name of the expectation written in camel case. Ex: ExpectColumnLengthMatchInputLength

    \n\n

    File Structure

    \n\n

    The file contains two main sections:

    \n\n\n\n

    Metric Definition

    \n\n

    In this section we define the logic of the expectation. This needs to follow a certain structure:

    \n\n

    Code Structure

    \n\n

    1) The class you define needs to extend one of the Metric Providers defined by Great Expectations that corresponds \nto your expectation's type. More info on the metric providers.

    \n\n

    2) You need to define the name of your metric. This name must be unique and must follow the following structure: \ntype of expectation.name of metric. Ex.: column_pair_values.a_smaller_or_equal_than_b\nTypes of expectations: column_values, multicolumn_values, column_pair_values, table_rows, table_columns.

    \n\n

    3) Any GX default parameters that are necessary to calculate your metric must be defined as \"condition_domain_keys\".

    \n\n

    4) Any additional parameters that are necessary to calculate your metric must be defined as \"condition_value_keys\".

    \n\n

    5) The logic of your expectation must be defined for the SparkDFExecutionEngine in order to be run on the Lakehouse.

    \n\n
    \n
    1) class ColumnMapMetric(ColumnMapMetricProvider):\n    """Asserts that a column matches a pattern."""\n\n    2) condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"\n    3) condition_domain_keys = (\n        "batch_id",\n        "table",\n        "column_A",\n        "column_B",\n        "ignore_row_if",\n    )\n    4) condition_value_keys = ("margin",)\n\n    5) @column_pair_condition_partial(engine=SparkDFExecutionEngine)\n    def _spark(\n        self: ColumnPairMapMetricProvider,\n        column_A: Any,\n        column_B: Any,\n        margin: Any,\n        **kwargs: dict,\n    ) -> Any:\n        """Implementation of the expectation's logic.\n\n        Args:\n            column_A: Value of the row of column_A.\n            column_B: Value of the row of column_B.\n            margin: margin value to be added to column_b.\n            kwargs: dict with additional parameters.\n\n        Returns:\n            If the condition is met.\n        """\n        if margin is None:\n            approx = 0\n        elif not isinstance(margin, (int, float, complex)):\n            raise TypeError(\n                f"margin must be one of int, float, complex."\n                f" Found: {margin} as {type(margin)}"\n            )\n        else:\n            approx = margin  # type: ignore\n\n        return column_A <= column_B + approx  # type: ignore\n
    \n
    \n\n

    Expectation Definition

    \n\n

    In this section we define the expectation. This needs to follow a certain structure:

    \n\n

    Code Structure

    \n\n

    1) The class you define needs to extend one of the Expectations defined by Great Expectations that corresponds to your expectation's type.

    \n\n

    2) You must define an \"examples\" object where you define at least one success and one failure of your expectation to\ndemonstrate its logic. The result format must be set to COMPLETE, and you must set the unexpected_index_column_names variable.

    \n\n
    \n\n

    For any examples where you will have unexpected results you must define unexpected_index_list in your \"out\" element.\nThis will be validated during the testing phase.

    \n\n
    \n\n

    3) The metric must be the same you defined in the metric definition.

    \n\n

    4) You must define all additional parameters that the user has to/should provide to the expectation.

    \n\n

    5) You should define any default values for your expectations parameters.

    \n\n

    6) You must define the _validate method as shown in the example. You must call the validate_result function\ninside your _validate method; this process adds a validation of the unexpected index list in the examples.

    \n\n
    \n\n

    If your custom expectation requires any extra validations, or you require additional fields to be returned on\nthe final dataframe, you can add them in this function.\nThe validate_result method has two optional parameters (partial_success and partial_result) that can be used to\npass the result of additional validations and to add more information to the result key of the returned dict, respectively.

    \n\n
    \n\n
    \n
    1) class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):\n    """Expect values in column A to be lower or equal than column B.\n\n    Args:\n        column_A: The first column name.\n        column_B: The second column name.\n        margin: additional approximation to column B value.\n\n    Keyword Args:\n        allow_cross_type_comparisons: If True, allow\n            comparisons between types (e.g. integer and string).\n            Otherwise, attempting such comparisons will raise an exception.\n        ignore_row_if: "both_values_are_missing",\n            "either_value_is_missing", "neither" (default).\n        result_format: Which output mode to use:\n            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.\n        include_config: If True (default), then include the expectation config\n            as part of the result object.\n        catch_exceptions: If True, then catch exceptions and\n            include them as part of the result object. Default: False.\n        meta: A JSON-serializable dictionary (nesting allowed)\n            that will be included in the output without modification.\n\n    Returns:\n        An ExpectationSuiteValidationResult.\n    """\n    2) examples = [\n        {\n            "dataset_name": "Test Dataset",\n            "data": {\n                "a": [11, 22, 50],\n                "b": [10, 21, 100],\n                "c": [9, 21, 30],\n            },\n            "schemas": {\n                "spark": {"a": "IntegerType", "b": "IntegerType", "c": "IntegerType"}\n            },\n            "tests": [\n                {\n                    "title": "negative_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "c",\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["c"],\n                            "include_unexpected_rows": True,\n                        },\n                    },\n                    "out": {\n                        "success": False,\n                        "unexpected_index_list": [\n                            {"c": 9, "a": 11},\n                            {"c": 21, "a": 22},\n                            {"c": 30, "a": 50},\n                        ],\n                    },\n                },\n                {\n                    "title": "positive_test",\n                    "exact_match_out": False,\n                    "include_in_gallery": True,\n                    "in": {\n                        "column_A": "a",\n                        "column_B": "b",\n                        "margin": 1,\n                        "result_format": {\n                            "result_format": "COMPLETE",\n                            "unexpected_index_column_names": ["a"],\n                        },\n                    },\n                    "out": {"success": True},\n                },\n            ],\n        },\n    ]\n\n    3) map_metric = "column_values.pattern_match"\n    4) success_keys = (\n        "validation_regex",\n        "mostly",\n    )\n    5) default_kwarg_values = {\n        "ignore_row_if": "never",\n        "result_format": "BASIC",\n        "include_config": True,\n        "catch_exceptions": False,\n        "mostly": 1,\n    }\n\n    6) def _validate(\n        self,\n        configuration: 
ExpectationConfiguration,\n        metrics: Dict,\n        runtime_configuration: Optional[dict] = None,\n        execution_engine: Optional[ExecutionEngine] = None,\n    ) -> dict:\n        """Custom implementation of the GX _validate method.\n\n        This method is used on the tests to validate both the result\n        of the tests themselves and if the unexpected index list\n        is correctly generated.\n        The GX test logic does not do this validation, and thus\n        we need to make it manually.\n\n        Args:\n            configuration: Configuration used in the test.\n            metrics: Test result metrics.\n            runtime_configuration: Configuration used when running the expectation.\n            execution_engine: Execution Engine where the expectation was run.\n\n        Returns:\n            Dictionary with the result of the validation.\n        """\n        return validate_result(self, configuration, metrics)\n
    \n
    \n\n

    Printing the Expectation Diagnostics

    \n\n

    Your expectations must include the ability to call the Great Expectations diagnostic function in order to be validated.

    \n\n

    In order to do this, the following code must be present.

    \n\n
    \n
    """Mandatory block of code. If it is removed the expectation will not be available."""\nif __name__ == "__main__":\n    # test the custom expectation with the function `print_diagnostic_checklist()`\n    ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()\n
    \n
    \n\n

    Creation Process

    \n\n

    1) Create a branch from lakehouse engine.

    \n\n

    2) Create a custom expectation with your specific logic:

    \n\n
      \n
    1. All new expectations must be placed inside folder /lakehouse_engine/dq_processors/custom_expectations.
    2. The name of the expectation must be added to the file /lakehouse_engine/core/definitions.py, to the variable: CUSTOM_EXPECTATION_LIST.
    3. All new expectations must be tested on /tests/feature/custom_expectations/test_custom_expectations.py.\nIn order to create a new test for your custom expectation it is necessary to:
    \n\n\n\n

    3) When the development is completed, create a pull request with your changes.

    \n\n

    4) Your expectation will be available with the next release of the lakehouse engine that happens after your pull request is approved.\nThis means that you need to upgrade your version of the lakehouse engine in order to use it.

    \n\n

    Usage

    \n\n

    Custom Expectations are available to use like any other expectations provided by Great Expectations.
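    For illustration, a dq_functions entry referencing the custom expectation defined above could look like the following (a sketch, assuming the engine resolves the expectation by the snake_case version of its class name):

    \n\n
    \n
    # Hypothetical dq_functions entry for ExpectColumnPairAToBeSmallerOrEqualThanB.
    dq_functions = [
        {
            "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
            "args": {"column_A": "a", "column_B": "b", "margin": 1},
        }
    ]
    \n
    \n\n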

    \n\n

    Parameters

    \n\n

    Depending on the type of expectation you are defining, some parameters are expected by default.\nEx: A ColumnMapExpectation has a default \"column\" parameter.

    \n\n

    Mostly

    \n\n

    Mostly is a standard\nparameter for a subset of expectations that is used to define a threshold for the failure of an expectation.\nEx: A mostly value of 0.7 makes it so that the expectation only fails if more than 30% of the records have\na negative result (i.e., the expectation succeeds as long as at least 70% of the records pass).
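    For example, a sketch of a dq_functions entry using mostly (the expectation and column name are illustrative):

    \n\n
    \n
    # With "mostly": 0.7 the expectation still succeeds as long as at least 70% of the rows pass the check.
    dq_functions = [
        {
            "function": "expect_column_values_to_not_be_null",
            "args": {"column": "article", "mostly": 0.7},
        }
    ]
    \n
    \n\n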

    \n\n

    Result Format

    \n\n

    Great Expectations has several different types of result formats \nfor the expectations results. The lakehouse engine requires the result format to be set to \"COMPLETE\" in order to tag \nthe lines where the expectations failed.

    \n\n

    unexpected_index_column_names

    \n\n

    Inside this key you must define what columns are used as an index inside your data. If this is set and the result \nformat is set to \"COMPLETE\" a list with the indexes of the lines that failed the validation will be returned by \nGreat Expectations.\nThis information is used by the Lakehouse Engine to tag the lines in error after the fact. The additional tests \ninside the _validate method verify that the custom expectation is tagging these lines correctly.

    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.data_quality_validator", "modulename": "lakehouse_engine_usage.data_quality.data_quality_validator", "kind": "module", "doc": "

    Data Quality Validator

    \n\n

    The DQValidator algorithm allows DQ validations isolated from the data load (only read and apply data quality validations).\nWith this algorithm you have the capacity to apply the Lakehouse-Engine Data Quality Process,\nusing Great Expectations functions directly on a specific dataset, while also\nmaking use of all the InputSpecs available in the engine.

    \n\n

    Validating the data quality using this algorithm is a matter of defining the data you want to read and the validations you want to apply to your data, detailing the Great Expectations functions you want to apply on the data to assess its quality.

    \n\n
    \n\n

    This algorithm also gives the possibility to restore a previous version of a delta table or delta files in case the DQ\nprocess raises any exception. Please use it carefully! You may lose important commits and data. Moreover, this will\nhighly depend on the frequency with which you run your Data Quality validations. If you run your data loads daily and Data\nQuality validations weekly, and you set restore_prev_version to true, the table will be restored\nto the previous version, but the error could have happened 4 or 5 versions before.

    \n\n
    \n\n

    When to use?

    \n\n\n\n

    This algorithm also gives teams some freedom to:

    \n\n\n\n

    How to use?

    \n\n

    All of these configurations are passed via the ACON to instantiate\na DQValidatorSpec object. The DQValidator algorithm uses an\nACON to configure its execution. In DQValidatorSpec you can\nfind the meaning of each ACON property.

    \n\n

    Here is an example of ACON configuration:

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_spec": {\n        "spec_id": "sales_source",\n        "read_type": "batch",\n        "data_format": "table",\n        "db_table": "my_database.my_table"\n    },\n    "dq_spec": {\n        "spec_id": "dq_sales",\n        "input_id": "sales_source",\n        "dq_type": "validator",\n        "store_backend": "file_system",\n        "local_fs_root_dir": "/app/tests/lakehouse/in/feature/dq_validator/dq",\n        "result_sink_db_table": "my_database.dq_validator",\n        "result_sink_format": "json",\n        "fail_on_error": False,\n        "dq_functions": [\n            {"function": "expect_column_to_exist", "args": {"column": "article"}},\n            {\n                "function": "expect_table_row_count_to_be_between",\n                "args": {"min_value": 3, "max_value": 11},\n            },\n        ],\n    },\n    "restore_prev_version": True,\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    On this page you will also find the following examples of usage:

    \n\n
      \n
    1. Dataframe as input & Success on the DQ Validation
    2. Table as input & Failure on DQ Validation & Restore previous version
    3. Files as input & Failure on DQ Validation & Fail_on_error disabled
    4. Files as input & Failure on DQ Validation & Critical functions defined
    5. Files as input & Failure on DQ Validation & Max failure percentage defined
    \n\n

    Example 1: Dataframe as input & Success on the DQ Validation

    \n\n

    This example focuses on using a dataframe, computed in this notebook, directly in the input spec. First, a new\nDataFrame is generated as a result of the join of data from two tables (dummy_deliveries and dummy_pd_article) and\nsome DQ Validations are applied on top of this dataframe.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\ninput_df = spark.sql("""\n        SELECT a.*, b.article_category, b.article_color\n        FROM my_database.dummy_deliveries a\n        JOIN my_database.dummy_pd_article b\n            ON a.article_id = b.article_id\n        """\n)\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_article_input",\n        "read_type": "batch",\n        "data_format": "dataframe",\n        "df_name": input_df,\n    },\n    "dq_spec": {\n        "spec_id": "deliveries_article_dq",\n        "input_id": "deliveries_article_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "result_sink_db_table": "my_database.dq_validator_deliveries",\n        "result_sink_location": "my_dq_path/dq_validator/dq_validator_deliveries/",\n        "expectations_store_prefix": "dq/dq_validator/expectations/",\n        "validations_store_prefix": "dq/dq_validator/validations/",\n        "data_docs_prefix": "dq/dq_validator/data_docs/site/",\n        "checkpoint_store_prefix": "dq/dq_validator/checkpoints/",\n        "unexpected_rows_pk": ["salesorder", "delivery_item", "article_id"],\n        "dq_functions": [{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}],\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Example 2: Table as input & Failure on DQ Validation & Restore previous version

    \n\n

    In this example we are using a table as input to validate the data that was loaded. Here, we are forcing the DQ Validations to fail in order to show the possibility of restoring the table to the previous version.

    \n\n
    \n\n

    Be careful when using the feature of restoring a previous version of a delta table or delta files. You may\nlose important commits and data. Moreover, this will highly depend on the frequency that you run your Data Quality\nvalidations. If you run your data loads daily and Data Quality validations weekly, and you define the\nrestore_prev_version to true, this means that the table will be restored to the previous version, but the error\ncould have happened 4 or 5 versions before (because loads are daily, validations are weekly).

    \n\n
    \n\n

    Steps followed in this example to show how the restore_prev_version feature works.

    \n\n
      \n
    1. Insert rows into the dummy_deliveries table to adjust the total number of rows and make the DQ process fail.
    2. Use the \"DESCRIBE HISTORY\" statement to check the number of versions available on the table and check the version\nnumber resulting from the insertion to the table.
    3. Execute the DQ Validation, using the configured ACON (based on reading the dummy_deliveries table and setting \nrestore_prev_version to true). Checking the logs of the process, you can see that the data did not pass all the \nexpectations defined and that the table version restore process was triggered.
    4. Re-run a \"DESCRIBE HISTORY\" statement to check that the previous version of the table was restored and thus the row inserted at the beginning of the process is no longer present in the table.
    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\n# Force failure of data quality by adding new row\nspark.sql("""INSERT INTO my_database.dummy_deliveries VALUES (7, 1, 20180601, 71, "article1", "delivered")""")\n\n\n# Check history of the table\nspark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "read_type": "batch",\n        "db_table": "my_database.dummy_deliveries",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}},\n        ],\n    },\n    "restore_prev_version": True,\n}\n\nexecute_dq_validation(acon=acon)\n\n# Check that the previous version of the table was restored\nspark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")\n
    \n
    \n\n

    Example 3: Files as input & Failure on DQ Validation & Fail_on_error disabled

    \n\n

    In this example we are using a location as input to validate the files in a specific folder. Here, we are forcing the DQ Validations to fail while disabling the "fail_on_error" configuration, so the algorithm warns about the expectations that failed, but the process/execution of the algorithm doesn't fail.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "data_format": "delta",\n        "read_type": "streaming",\n        "location": "s3://my_data_product_bucket/silver/dummy_deliveries/",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "fail_on_error": False,\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},\n        ],\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Example 4: Files as input & Failure on DQ Validation & Critical functions defined

    \n\n

    In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the critical functions feature, which will throw an error\nif any of the functions fails.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "data_format": "delta",\n        "read_type": "streaming",\n        "location": "s3://my_data_product_bucket/silver/dummy_deliveries/",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "fail_on_error": True,\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n        ],\n        "critical_functions": [\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},\n        ],\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Example 5: Files as input & Failure on DQ Validation & Max failure percentage defined

    \n\n

    In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the max_percentage_failure,\nwhich will throw an error if the percentage of failures surpasses the defined maximum threshold.

    \n\n
    \n
    from lakehouse_engine.engine import execute_dq_validation\n\nacon = {\n    "input_spec": {\n        "spec_id": "deliveries_input",\n        "data_format": "delta",\n        "read_type": "streaming",\n        "location": "s3://my_data_product_bucket/silver/dummy_deliveries/",\n    },\n    "dq_spec": {\n        "spec_id": "dq_deliveries",\n        "input_id": "deliveries_input",\n        "dq_type": "validator",\n        "bucket": "my_data_product_bucket",\n        "data_docs_bucket": "my_dq_data_docs_bucket",\n        "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n        "tbl_to_derive_pk": "my_database.dummy_deliveries",\n        "fail_on_error": True,\n        "dq_functions": [\n            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},\n            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},\n        ],\n        "max_percentage_failure": 0.2,\n    },\n    "restore_prev_version": False,\n}\n\nexecute_dq_validation(acon=acon)\n
    \n
    \n\n

    Limitations

    \n\n

    Unlike DataLoader, this new DQValidator algorithm only allows, for now, one input_spec (instead of a list of input_specs) and one dq_spec (instead of a list of dq_specs). There are plans and efforts already initiated to support lists of input_specs and dq_specs here as well. However, you can prepare a DataFrame which joins more than one source and use it as input, in case you need to assess the Data Quality of different sources at the same time. Alternatively, you can also express interest in enhancing this feature, or contribute to it yourself.

    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.minimal_example", "modulename": "lakehouse_engine_usage.data_quality.minimal_example", "kind": "module", "doc": "

    Minimal Example

    \n\n

    This scenario illustrates the minimal configuration that you can have to use dq_specs, in which it uses the required parameters spec_id, input_id, dq_type, bucket and dq_functions, plus the optional parameter data_docs_bucket. This parameter allows you to store the GX documentation in another bucket that can be used to make your data docs available in the DQ Web App (GX UI) without giving users access to your bucket. The data_docs_bucket property supersedes the bucket property only for data docs storage.

    \n\n

    Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check:

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.result_sink", "modulename": "lakehouse_engine_usage.data_quality.result_sink", "kind": "module", "doc": "

    Result Sink

    \n\n

    These scenarios store the results of the dq_specs into a result sink. For that, both scenarios include parameters defining the specific table and location (result_sink_db_table and result_sink_location) where the results are expected to be stored. With this configuration, people can later check the history of the DQ executions using the configured table/location, as shown below. You can configure saving the output of the results in the result sink following two approaches:

    | ... | source | column | max_value | min_value | expectation_type | expectation_success | observed_value | run_time_year | ... |
    |---|---|---|---|---|---|---|---|---|---|
    | all columns from raw + more | deliveries | salesorder | null | null | expect_column_to_exist | TRUE | null | 2023 | ... |
    | all columns from raw + more | deliveries | null | null | null | expect_table_row_count_to_be_between | TRUE | 23 | 2023 | ... |
    | all columns from raw + more | deliveries | null | null | null | expect_table_column_count_to_be_between | TRUE | 6 | 2023 | ... |

    | checkpoint_config | run_name | run_time | run_results | success | validation_result_identifier | spec_id | input_id |
    |---|---|---|---|---|---|---|---|
    | entire configuration | 20230323-...-dq_validation | 2023-03-23T15:11:32.225354+00:00 | results of the 3 expectations | true/false for the run | identifier | spec_id | input_id |
    \n\n
    \n\n\n\n
    \n\n

    1. Result Sink Exploded (Recommended)

    \n\n

    This scenario stores the DQ Results (results produced by the execution of the dq_specs) in the Result Sink in a detailed format, in which people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments. This is the recommended approach, since it makes the analysis on top of the result sink much easier and faster.

    \n\n

    To achieve the exploded data model, this scenario introduces the parameter result_sink_explode, which is a flag determining whether the output table/location should have the columns exploded (True) or not (False). It defaults to True, but it is still provided explicitly in this scenario for demo purposes. The table/location will include a schema containing general columns, statistic columns, arguments of expectations, and others; thus, part of the schema will always be populated, while another part will depend on the expectations chosen.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "result_sink_explode": True,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_success",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    To check the history of the DQ results, you can run commands like:

    \n\n\n\n
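    For instance, a minimal sketch, assuming the result_sink_db_table configured above (my_database.dq_result_sink) and the column names from the example schema shown earlier:

    # A sketch only: query the exploded result sink configured in the dq_spec above.
    result_sink_df = spark.table("my_database.dq_result_sink")

    # Keep a few of the columns shown in the exploded schema example
    # (adjust the column list to your sink's actual schema).
    display(
        result_sink_df.select(
            "source", "column", "expectation_type", "expectation_success", "observed_value"
        )
    )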

    2. Raw Result Sink

    \n\n

    This scenario is very similar to the previous one, but it changes the parameter result_sink_explode to False so that it produces a raw result sink output containing only one row representing the full run of the dq_specs (no matter the number of expectations/dq_functions defined there). Being a raw output, it is not a recommended approach, as it is more complicated to analyse and query.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink_raw",\n            "result_sink_location": "my_dq_path/dq_result_sink_raw/",\n            "result_sink_explode": False,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_success_raw",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    To check the history of the DQ results, you can run commands like:
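    For instance, a minimal sketch, assuming the result_sink_db_table configured above (my_database.dq_result_sink_raw):

    # A sketch only: each dq_specs run is stored as a single raw row in this table.
    display(spark.table("my_database.dq_result_sink_raw"))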

    \n\n\n"}, {"fullname": "lakehouse_engine_usage.data_quality.row_tagging", "modulename": "lakehouse_engine_usage.data_quality.row_tagging", "kind": "module", "doc": "

    Row Tagging

    \n\n

    Data quality is essential for any organisation that relies on data to make informed decisions. \nHigh-quality data provides accurate, reliable, and timely information that enables organisations to identify\nopportunities, mitigate risks, and optimize their operations. In contrast, low-quality data can lead to incorrect\nconclusions, faulty decisions, and wasted resources.

    \n\n

    There are several common issues that can compromise data quality, such as:

    \n\n\n\n

    Therefore, implementing data quality controls, such as data validation rules, and regularly monitoring data for \naccuracy and completeness is key for any organisation.

    \n\n

    One of these controls that can be applied is the DQ Row Tagging Strategy so that you not only apply validations on \nyour data to ensure Data Quality, but you also tag your data with the results of the Data Quality validations \nproviding advantages like:

    \n\n\n\n
    \n\n

    When using the DQ Row Tagging approach, data availability takes precedence over Data Quality, meaning that all the data will be introduced into the final target (e.g. table or location) regardless of any Data Quality issues it may have.

    \n\n
    \n\n

    Different Types of Expectations:

    \n\n\n\n

    The expectations highlighted as row level are the ones enabling the tagging of failures on specific rows and adding the details about each failure (they affect the field run_row_result inside dq_validations). The expectations with other levels (not row level) influence the overall result of the Data Quality execution, but won't be used to tag specific rows (they affect the field run_success only, so you can even have situations in which you get run_success False and run_row_success True for all rows).

    \n\n

    How does the Strategy work?

    \n\n

    The strategy relies mostly on the 6 arguments below.

    \n\n
    \n\n

    When you specify \"tag_source_data\": True the arguments fail_on_error, gx_result_format and \nresult_sink_explode are set to the expected values.

    \n\n
    \n\n\n\n
    \n\n

    It only works if result_sink_explode is True, result_format is COMPLETE and fail_on_error is False.

    \n\n
    \n\n\n\n
    \n\n

    It is mandatory to provide one of the arguments (unexpected_rows_pk or tbl_to_derive_pk) when using \ntag_source_data as True. \nWhen tag_source_data is False, this is not mandatory, but still recommended.

    \n\n
    \n\n

    \n\n
    \n\n

    The tagging strategy only works when tag_source_data is True, which automatically\nassigns the expected values for the parameters result_sink_explode (True), fail_on_error (False)\nand gx_result_format (\"COMPLETE\").

    \n\n
    \n\n
    \n\n

    For the DQ Row Tagging to work, in addition to configuring the aforementioned arguments in the dq_specs, \nyou will also need to add the dq_validations field into your table (your DDL statements, recommended) or \nenable schema evolution.

    \n\n
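    As a hypothetical sketch of the DDL route (the target table name and the struct fields are assumptions based only on the fields referenced on this page; check the engine's reference schema for the exact and complete definition):

    # Hypothetical: add a dq_validations struct column to the target Delta table.
    # The struct below is illustrative only (run_success, run_row_success,
    # raised_exceptions and kwargs are simply the fields referenced on this page).
    spark.sql("""
        ALTER TABLE my_database.dummy_deliveries_dq_template
        ADD COLUMNS (
            dq_validations STRUCT<
                run_success: BOOLEAN,
                run_row_success: BOOLEAN,
                raised_exceptions: BOOLEAN,
                kwargs: STRING
            >
        )
    """)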
    \n\n
    \n\n

    The kwargs field is a string, because it can assume different schemas for different expectations and runs. It is useful to provide the complete picture of the row level failure and to allow filtering/joining with the result sink table, when there is one. Some examples of kwargs below:

    \n\n\n\n
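    As an illustration only (the exact serialization of kwargs may differ between engine/GX versions), kwargs strings for two of the expectations used in the example below might look like:

    # Illustrative kwargs strings; their shape depends on the expectation they belong to.
    kwargs_value_set = '{"column": "salesorder", "value_set": ["37"]}'
    kwargs_multicolumn_sum = '{"column_list": ["salesorder", "delivery_item"], "sum_total": 100}'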
    \n\n

    Example

    \n\n

    This scenario uses the row tagging strategy, which allows users to tag the rows that failed, making it easier to identify the problems in the validations.

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "tag_source_data": True,\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_tag",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n                {\n                    "function": "expect_column_values_to_be_in_set",\n                    "args": {"column": "salesorder", "value_set": ["37"]},\n                },\n                {\n                    "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",\n                    "args": {"column_A": "salesorder", "column_B": "delivery_item"},\n                },\n                {\n                    "function": "expect_multicolumn_sum_to_equal",\n                    "args": {"column_list": ["salesorder", "delivery_item"], "sum_total": 100},\n                },\n            ],\n            "critical_functions": [\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 6}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    Running the cell below shows the new column created, named dq_validations, with information about the DQ validations.
    display(spark.read.format("delta").load("s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/"))

    \n\n

    Performance and Limitations Trade-offs

    \n\n

    When using the DQ Row Tagging Strategy, by default we are using Great Expectations Result Format \"Complete\" with \nUnexpected Index Column Names (a primary key for the failures), meaning that for each failure, we are getting all \nthe distinct values for the primary key. After getting all the failures, we are applying some needed transformations \nand joining them with the source data, so that it can be tagged by filling the \"dq_validations\" column.

    \n\n

    Hence, this can definitely be a heavy and time-consuming operation on your data loads. To reduce this disadvantage you can cache the dataframe by passing "cache_df": True in your DQ Specs. In addition, keep in mind that each expectation (dq_function) that you add to your DQ Specs adds more time to your data loads, so always balance performance against the amount of validations that you need.

    \n\n

    Moreover, Great Expectations is currently relying on the driver node to capture the results of the execution and \nreturn/store them. Thus, in case you have huge amounts of rows failing (let's say 500k or more) Great Expectations \nmight raise exceptions.

    \n\n

    In these situations, the data load will still happen and the data will still be tagged with the Data Quality validations information; however, you won't have the complete picture of the failures, so the raised_exceptions field is filled as True, so that you can easily notice it and debug it.

    \n\n

    Most of the time, if you have such an amount of rows failing, it will probably mean that you did something wrong and want to fix it as soon as possible (you are not really caring about tagging specific rows, because you will not want your consumers to be consuming a million defective rows). However, if you still want to try to make it pass, you can try to increase your driver and play with some spark configurations like:

    \n\n\n\n
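    As an illustration (the property names are standard Spark settings; the values are assumptions and are normally set at cluster configuration time):

    # Illustrative driver-side tuning when large GX results must be collected.
    driver_tuning = {
        "spark.driver.maxResultSize": "8g",  # cap on serialized results collected to the driver
        "spark.driver.memory": "16g",        # overall driver heap size
    }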

    For debugging purposes, you can also use a different Great Expectations Result Format like \"SUMMARY\" (adding in your DQ Spec\n\"gx_result_format\": \"SUMMARY\"), so that you get only a partial list of the failures, avoiding surpassing the driver\ncapacity.

    \n\n
    \n\n

    When using a Result Format different from the default (\"COMPLETE\"), the flag \"tag_source_data\" will be \noverwritten to False, as the results of the tagging wouldn't be complete which could lead to erroneous \nconclusions from stakeholders (but you can always get the details about the result of the DQ execution in\nthe result_sink_location or result_sink_db_table that you have configured).

    \n\n
    \n"}, {"fullname": "lakehouse_engine_usage.data_quality.validations_failing", "modulename": "lakehouse_engine_usage.data_quality.validations_failing", "kind": "module", "doc": "

    Validations Failing

    \n\n

    The scenarios presented on this page are similar, but their goal is to show what happens when a DQ expectation fails the validations.\nThe logs generated by the execution of the code will contain information regarding which expectation(s) have failed and why.

    \n\n

    1. Fail on Error

    \n\n

    In this scenario, two parameters are specified below:

    \n\n\n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "source": "deliveries_fail",\n            "fail_on_error": False,\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 20}},\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},\n                {"function": "expect_column_values_to_be_null", "args": {"column": "article"}},\n                {"function": "expect_column_values_to_be_unique", "args": {"column": "status"}},\n                {\n                    "function": "expect_column_min_to_be_between",\n                    "args": {"column": "delivery_item", "min_value": 1, "max_value": 15},\n                },\n                {\n                    "function": "expect_column_max_to_be_between",\n                    "args": {"column": "delivery_item", "min_value": 15, "max_value": 30},\n                },\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n\n

    If you run the command below, you will see that the success column has the value false for the last execution.
    display(spark.table("my_database.dq_result_sink"))

    \n\n

    2. Critical Functions

    \n\n

    In this scenario, alternative parameters to fail_on_error are used:

    \n\n\n\n

    Additionally, other parameters can also be defined, such as:

    \n\n\n\n

    You can also pair critical_functions with max_percentage_failure by defining, for example, a 0.6 maximum percentage of failure while also defining some critical functions. In this case, even if the threshold is respected, the list defined in critical_functions is still checked, as shown in the fragment below.
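    A minimal sketch of that pairing inside a dq_spec (fragment only; the values are illustrative):

    # Fragment of a dq_spec pairing a failure threshold with critical functions:
    # up to 60% of the dq_functions may fail, but the critical function must never fail.
    dq_spec_fragment = {
        "max_percentage_failure": 0.6,
        "critical_functions": [
            {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},
        ],
    }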

    \n\n
    \n
    from lakehouse_engine.engine import load_data\n\nacon = {\n    "input_specs": [\n        {\n            "spec_id": "dummy_deliveries_source",\n            "read_type": "batch",\n            "data_format": "csv",\n            "options": {\n                "header": True,\n                "delimiter": "|",\n                "inferSchema": True,\n            },\n            "location": "s3://my_data_product_bucket/dummy_deliveries/",\n        }\n    ],\n    "dq_specs": [\n        {\n            "spec_id": "dq_validator",\n            "input_id": "dummy_deliveries_source",\n            "dq_type": "validator",\n            "bucket": "my_data_product_bucket",\n            "data_docs_bucket": "my_dq_data_docs_bucket",\n            "data_docs_prefix": "dq/my_data_product/data_docs/site/",\n            "result_sink_db_table": "my_database.dq_result_sink",\n            "result_sink_location": "my_dq_path/dq_result_sink/",\n            "source": "deliveries_critical",\n            "tbl_to_derive_pk": "my_database.dummy_deliveries",\n            "dq_functions": [\n                {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},\n                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},\n            ],\n            "critical_functions": [\n                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},\n            ],\n        }\n    ],\n    "output_specs": [\n        {\n            "spec_id": "dummy_deliveries_bronze",\n            "input_id": "dq_validator",\n            "write_type": "overwrite",\n            "data_format": "delta",\n            "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",\n        }\n    ],\n}\n\nload_data(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.reconciliator", "modulename": "lakehouse_engine_usage.reconciliator", "kind": "module", "doc": "

    Reconciliator

    \n\n

    Checking if data reconciles, using this algorithm, is a matter of reading the truth data and the current data.\nYou can use any input specification compatible with the lakehouse engine to read truth or current data. On top\nof that, you can pass a truth_preprocess_query and a current_preprocess_query so you can preprocess the data before\nit goes into the actual reconciliation process. The reconciliation process is focused on joining truth\nwith current by all provided columns except the ones passed as metrics.

    \n\n

    In the table below, we present what a simple reconciliation would look like:

    | current_country | current_count | truth_country | truth_count | absolute_diff | perc_diff | yellow | red | recon_type |
    |---|---|---|---|---|---|---|---|---|
    | Sweden | 123 | Sweden | 120 | 3 | 0.025 | 0.1 | 0.2 | percentage |
    | Germany | 2946 | Sweden | 2946 | 0 | 0 | 0.1 | 0.2 | percentage |
    | France | 2901 | France | 2901 | 0 | 0 | 0.1 | 0.2 | percentage |
    | Belgium | 426 | Belgium | 425 | 1 | 0.002 | 0.1 | 0.2 | percentage |
    \n\n

    The Reconciliator algorithm uses an ACON to configure its execution. You can find the meaning of each ACON property\nin ReconciliatorSpec object.

    \n\n

    Below is an example of the usage of the reconciliator.

    \n\n
    \n
    from lakehouse_engine.engine import execute_reconciliation\n\ntruth_query = """\n  SELECT\n    shipping_city,\n    sum(sales_order_qty) as qty,\n    order_date_header\n  FROM (\n    SELECT\n      ROW_NUMBER() OVER (\n        PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city\n        ORDER BY changed_on desc\n      ) as rank1,\n      sales_order_header,\n      sales_order_item,\n      sales_order_qty,\n      order_date_header,\n      shipping_city\n    FROM truth -- truth is a locally accessible temp view created by the lakehouse engine\n    WHERE order_date_header = '2021-10-01'\n  ) a\nWHERE a.rank1 = 1\nGROUP BY a.shipping_city, a.order_date_header\n"""\n\ncurrent_query = """\n  SELECT\n    shipping_city,\n    sum(sales_order_qty) as qty,\n    order_date_header\n  FROM (\n    SELECT\n      ROW_NUMBER() OVER (\n        PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city\n        ORDER BY changed_on desc\n      ) as rank1,\n      sales_order_header,\n      sales_order_item,\n      sales_order_qty,\n      order_date_header,\n      shipping_city\n    FROM current -- current is a locally accessible temp view created by the lakehouse engine\n    WHERE order_date_header = '2021-10-01'\n  ) a\nWHERE a.rank1 = 1\nGROUP BY a.shipping_city, a.order_date_header\n"""\n\nacon = {\n    "metrics": [{"metric": "qty", "type": "percentage", "aggregation": "avg", "yellow": 0.05, "red": 0.1}],\n    "truth_input_spec": {\n        "spec_id": "truth",\n        "read_type": "batch",\n        "data_format": "csv",\n        "schema_path": "s3://my_data_product_bucket/artefacts/metadata/schemas/bronze/orders.json",\n        "options": {\n            "delimiter": "^",\n            "dateFormat": "yyyyMMdd",\n        },\n        "location": "s3://my_data_product_bucket/bronze/orders",\n    },\n    "truth_preprocess_query": truth_query,\n    "current_input_spec": {\n        "spec_id": "current",\n        "read_type": "batch",\n        "data_format": "delta",\n        "db_table": "my_database.orders",\n    },\n    "current_preprocess_query": current_query,\n}\n\nexecute_reconciliation(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor", "modulename": "lakehouse_engine_usage.sensor", "kind": "module", "doc": "

    Sensor

    \n\n

    What is it?

    \n\n

    The lakehouse engine sensors are an abstraction to otherwise complex spark code that can be executed in very small\nsingle-node clusters to check if an upstream system or data product contains new data since the last execution of our\njob. With this feature, we can trigger a job to run in more frequent intervals and if the upstream does not contain new\ndata, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction,\nTransformation, and Loading).

    \n\n

    How do Sensor-based jobs work?

    \n\n

    \"image\"

    \n\n

    With the sensors capability, data products in the lakehouse can sense whether another data product or an upstream system (source system) has new data since the last successful job. We accomplish this through the approach illustrated above, which can be interpreted as follows:

    \n\n
      \n
    1. A Data Product can check if Kafka, JDBC or any other source supported by the Lakehouse Engine Sensors contains new data, using the respective sensors;
    2. The Sensor task may run in a very tiny single-node cluster to ensure cost efficiency (check sensor cost efficiency);
    3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster to process all the ETL tasks (data processing tasks);
    4. In the same way, a different Data Product can sense if an upstream Data Product has new data by using 1 of 2 options:
       1. (Preferred) Sense the upstream Data Product's sensor control delta table;
       2. Sense the upstream Data Product's data files in s3 (files sensor) or any of their delta tables (delta table sensor).
    \n\n

    The Structure and Relevance of the Data Product's Sensors Control Table

    \n\n

    The concept of a lakehouse engine sensor is based on a special delta table stored inside the data product that chooses\nto opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that\ndata product. You can refer to the below table to understand the sensor delta table structure:

    - sensor_id (STRING): A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream. Each sensor in each job should have a different sensor_id. If you attempt to create 2 sensors with the same sensor_id, the engine will fail.
    - assets (ARRAY<STRING>): A list of assets (e.g., tables or dataset folders) that are considered as available to consume downstream after the sensor has status PROCESSED_NEW_DATA.
    - status (STRING): Status of the sensor. Can either be ACQUIRED_NEW_DATA (when the sensor in a job has recognised that there is new data from the upstream but the job where the sensor is was still not successfully executed) or PROCESSED_NEW_DATA (when the job where the sensor is located has processed all the tasks in that job).
    - status_change_timestamp (STRING): Timestamp of when the status last changed.
    - checkpoint_location (STRING): Base location of the Spark streaming checkpoint, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g. Spark streaming checkpoints are used for Kafka, Delta and File sensors.
    - upstream_key (STRING): Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically). This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database).
    - upstream_value (STRING): Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key. This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and is therefore useful for recognizing that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database).
    \n\n

    Note: to make use of the sensors you will need to add this table to your data product.

    \n\n
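    A minimal sketch of creating that control table, mirroring the structure described above (the table name matches the control_db_table_name used in the sensor examples on the following pages; the location is an assumption):

    # Sketch only: create the sensors control delta table in your data product.
    spark.sql("""
        CREATE TABLE IF NOT EXISTS my_database.lakehouse_engine_sensors (
            sensor_id STRING,
            assets ARRAY<STRING>,
            status STRING,
            status_change_timestamp STRING,
            checkpoint_location STRING,
            upstream_key STRING,
            upstream_value STRING
        )
        USING DELTA
        LOCATION 's3://my_data_product_bucket/lakehouse_engine_sensors/'
    """)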

    How is it different from scheduled jobs?

    \n\n

    Sensor-based jobs are still scheduled, but they can be scheduled with higher frequency, as they are more cost-efficient\nthan ramping up a multi-node cluster supposed to do heavy ETL, only to figure out that the upstream does not have new\ndata.

    \n\n

    Are sensor-based jobs cost-efficient?

    \n\n

    For the same schedule (e.g., 4 times a day), sensor-based jobs are more cost-efficient than scheduling a regular job, because with sensor-based jobs you can start a very tiny single-node cluster, and only if there is new data in the upstream is the bigger ETL cluster spun up. For this reason, they are considered more cost-efficient. Moreover, if you have very strict SLAs to comply with, you can also play with alternative architectures where you have several sensors in a continuous (always running) cluster, which then keeps triggering the respective data processing jobs whenever there is new data.

    \n\n

    Sensor Steps

    \n\n
      \n
    1. Create your sensor task for the upstream source (examples of available sources: Delta Table, Delta Upstream Sensor Table, Files, JDBC, Kafka, SAP BW/B4).
    2. Set up/execute your ETL task based on the Sensor condition.
    3. Update the Sensor Control table status with the Update Sensor Status.
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.delta_table", "modulename": "lakehouse_engine_usage.sensor.delta_table", "kind": "module", "doc": "

    Sensor from Delta Table

    \n\n

    This shows how to create a Sensor to detect new data from a Delta Table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table referenced as sensor_new_data.
    \n\n
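    For illustration, a hypothetical custom preprocess_query (the event_date column is an assumption about the upstream):

    # Hypothetical: the sensor exposes the upstream data as the view "sensor_new_data";
    # filter it so that only the relevant subset counts as new data.
    preprocess_query = """
        SELECT *
        FROM sensor_new_data
        WHERE event_date >= date_sub(current_date(), 1)
    """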

    If you want to view some examples of usage you can visit the delta upstream sensor table or the jdbc sensor.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. The fail_on_empty_result=False.
    \n\n

    Data will be consumed from a delta table in streaming mode, so if there is any new data, the condition to proceed to the next task is met.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "delta",\n        "db_table": "upstream_database.source_delta_table",\n        "options": {\n            "readChangeFeed": "true", # to read changes in upstream table\n        },\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
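    For instance, a minimal sketch of gating the downstream steps on that boolean (run_etl is a placeholder for your own task trigger):

    def run_etl() -> None:
        """Placeholder for triggering your heavy ETL/data processing tasks."""
        ...

    if acquired_data:
        run_etl()  # new data upstream: proceed with the data processing tasks
    else:
        print("No new data upstream; exiting without spinning up the ETL cluster.")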
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.delta_upstream_sensor_table", "modulename": "lakehouse_engine_usage.sensor.delta_upstream_sensor_table", "kind": "module", "doc": "

    Sensor from other Sensor Delta Table

    \n\n

    This shows how to create a Sensor to detect new data from another Sensor Delta Table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data have to be filtered,
    \n\n

    in this case a custom query should be created with the source table as sensor_new_data.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. The fail_on_empty_result=False.
    \n\n

    It makes use of generate_sensor_query to generate the preprocess_query, unlike the delta_table scenario.

    \n\n

    Data from the other sensor's delta table will be consumed in streaming mode. If there is any new data, it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor, generate_sensor_query\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "delta",\n        "db_table": "upstream_database.lakehouse_engine_sensors",\n        "options": {\n            "readChangeFeed": "true",\n        },\n    },\n    "preprocess_query": generate_sensor_query("UPSTREAM_SENSOR_ID"),\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.file", "modulename": "lakehouse_engine_usage.sensor.file", "kind": "module", "doc": "

    Sensor from Files

    \n\n

    This shows how to create a Sensor to detect new data from a File Location.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data have to be filtered,
    \n\n

    in this case a custom query should be created with the source table as sensor_new_data.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. The fail_on_empty_result=False.
    \n\n

    Using these sensors and consuming the data in streaming mode, if any new file is added to the file location, it will automatically trigger the subsequent task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "csv",  # You can use any of the data formats supported by the lakehouse engine, e.g: "avro|json|parquet|csv|delta|cloudfiles"\n        "location": "s3://my_data_product_bucket/path",\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.jdbc_table", "modulename": "lakehouse_engine_usage.sensor.jdbc_table", "kind": "module", "doc": "

    Sensor from JDBC

    \n\n

    This shows how to create a Sensor to detect new data from a JDBC table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. Generic JDBC template with fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. Generic JDBC template with fail_on_empty_result=False.
    \n\n

    Data from JDBC will be consumed in batch mode. If there is new data, based on the preprocess query over the source table, it will trigger the condition to proceed to the next task.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor, generate_sensor_query\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "batch",\n        "data_format": "jdbc",\n        "jdbc_args": {\n            "url": "JDBC_URL",\n            "table": "JDBC_DB_TABLE",\n            "properties": {\n                "user": "JDBC_USERNAME",\n                "password": "JDBC_PWD",\n                "driver": "JDBC_DRIVER",\n            },\n        },\n        "options": {\n            "compress": True,\n        },\n    },\n    "preprocess_query": generate_sensor_query(\n        sensor_id="MY_SENSOR_ID",\n        filter_exp="?upstream_key > '?upstream_value'",\n        control_db_table_name="my_database.lakehouse_engine_sensors",\n        upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",\n    ),\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.kafka", "modulename": "lakehouse_engine_usage.sensor.kafka", "kind": "module", "doc": "

    Sensor from Kafka

    \n\n

    This shows how to create a Sensor to detect new data from Kafka.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data have to be filtered,
    \n\n

    in this case a custom query should be created with the source table as sensor_new_data.

    \n\n
    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. The fail_on_empty_result=False.
    \n\n

    Data from Kafka will be consumed in streaming mode, so if there is any new data in the Kafka topic, the condition to proceed to the next task is met.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "streaming",\n        "data_format": "kafka",\n        "options": {\n            "kafka.bootstrap.servers": "KAFKA_SERVER",\n            "subscribe": "KAFKA_TOPIC",\n            "startingOffsets": "earliest",\n            "kafka.security.protocol": "SSL",\n            "kafka.ssl.truststore.location": "TRUSTSTORE_LOCATION",\n            "kafka.ssl.truststore.password": "TRUSTSTORE_PWD",\n            "kafka.ssl.keystore.location": "KEYSTORE_LOCATION",\n            "kafka.ssl.keystore.password": "KEYSTORE_PWD",\n        },\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.sap_bw_b4", "modulename": "lakehouse_engine_usage.sensor.sap_bw_b4", "kind": "module", "doc": "

    Sensor from SAP

    \n\n

    This shows how to create a Sensor to detect new data from a SAP LOGCHAIN table.

    \n\n

    Configuration required to have a Sensor

    \n\n\n\n
    \n\n
    This parameter is only needed when the upstream data have to be filtered,
    \n\n

    in this case a custom query should be created with the source table as sensor_new_data.

    \n\n
    \n\n\n\n

    Specific configuration is required to have a Sensor consuming a SAP BW/B4 upstream. The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream: generate_sensor_sap_logchain_query and generate_sensor_query.

    \n\n\n\n

    If you want to know more please visit the definition of the class here.

    \n\n

    Scenarios

    \n\n

    This covers the following scenarios of using the Sensor:

    \n\n
      \n
    1. The fail_on_empty_result=True (the default and SUGGESTED behaviour).
    2. The fail_on_empty_result=False.
    \n\n

    Data from SAP will be consumed in batch mode, so if there is any new data in the SAP upstream (based on the preprocess query), the condition to proceed to the next task is met.

    \n\n

    fail_on_empty_result as True (default and SUGGESTED)

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor, generate_sensor_query, generate_sensor_sap_logchain_query\n\nacon = {\n    "sensor_id": "MY_SENSOR_ID",\n    "assets": ["MY_SENSOR_ASSETS"],\n    "control_db_table_name": "my_database.lakehouse_engine_sensors",\n    "input_spec": {\n        "spec_id": "sensor_upstream",\n        "read_type": "batch",\n        "data_format": "jdbc",\n        "options": {\n            "compress": True,\n            "driver": "JDBC_DRIVER",\n            "url": "JDBC_URL",\n            "user": "JDBC_USERNAME",\n            "password": "JDBC_PWD",\n            "prepareQuery": generate_sensor_sap_logchain_query(chain_id="CHAIN_ID", dbtable="JDBC_DB_TABLE"),\n            "query": generate_sensor_query(\n                sensor_id="MY_SENSOR_ID",\n                filter_exp="?upstream_key > '?upstream_value'",\n                control_db_table_name="my_database.lakehouse_engine_sensors",\n                upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",\n            ),\n        },\n    },\n    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",\n    "fail_on_empty_result": True,\n}\n\nexecute_sensor(acon=acon)\n
    \n
    \n\n

    fail_on_empty_result as False

    \n\n

    Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.

    \n\n
    \n
    from lakehouse_engine.engine import execute_sensor\n\nacon = {\n    [...],\n    "fail_on_empty_result": False\n}\n\nacquired_data = execute_sensor(acon=acon)\n
    \n
    \n"}, {"fullname": "lakehouse_engine_usage.sensor.update_sensor_status", "modulename": "lakehouse_engine_usage.sensor.update_sensor_status", "kind": "module", "doc": "

    Update Sensor control delta table after processing the data

    \n\n

    This shows how to update the status of your Sensor after processing the new data.

    \n\n

    Here is an example on how to update the status of your sensor in the Sensors Control Table:

    \n\n
    \n
    from lakehouse_engine.engine import update_sensor_status\n\nupdate_sensor_status(\n    sensor_id="MY_SENSOR_ID",\n    control_db_table_name="my_database.lakehouse_engine_sensors",\n    status="PROCESSED_NEW_DATA",\n    assets=["MY_SENSOR_ASSETS"]\n)\n
    \n
    \n\n

    If you want to know more please visit the definition of the class here.

    \n"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough.