diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..5071bec --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,14 @@ +ignore: + - setup.py +coverage: + status: + patch: + default: + target: 0% + threshold: 5% + if_ci_failed: error + project: + default: + target: 0% + threshold: 5% + if_ci_failed: error diff --git a/.github/workflows/ci-feature.yml b/.github/workflows/ci-feature.yml index 8f2365a..96bba0b 100644 --- a/.github/workflows/ci-feature.yml +++ b/.github/workflows/ci-feature.yml @@ -23,11 +23,43 @@ jobs: - name: Terraform Validate uses: dflook/terraform-validate@v1 + ci-python: + name: ci-python + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Python Install + uses: actions/setup-python@v3 + with: + python-version: '3.10' + + - name: Project Requirements + run: | + python -m pip install --upgrade pip + python -m pip install -r ./requirements/dev.txt + + - name: Linter with flake8 + run: + flake8 . --ignore E501 + + - name: Unit Test with pytest + run: | + python3 -m pytest -vv --color=yes --cov=./ --cov-report=xml + + - name: Test Coverage with codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + open-pr-to-main: name: open-pr-to-main runs-on: ubuntu-latest needs: - ci-terraform + - ci-python steps: - name: Checkout diff --git a/.github/workflows/ci-main.yml b/.github/workflows/ci-main.yml index c4b4498..dee1399 100644 --- a/.github/workflows/ci-main.yml +++ b/.github/workflows/ci-main.yml @@ -20,3 +20,34 @@ jobs: - name: Terraform Validate uses: dflook/terraform-validate@v1 + + ci-python: + name: ci-python + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Python Install + uses: actions/setup-python@v3 + with: + python-version: '3.10' + + - name: Project Requirements + run: | + python -m pip install --upgrade pip + python -m pip install -r ./requirements/dev.txt + + - name: Linter with flake8 + run: + flake8 . 
--ignore E501 + + - name: Unit Test with pytest + run: | + python3 -m pytest -vv --color=yes --cov=./ --cov-report=xml + + - name: Test Coverage with codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/app/src/transformers.py b/app/src/transformers.py index aac5955..54a8e21 100644 --- a/app/src/transformers.py +++ b/app/src/transformers.py @@ -85,6 +85,25 @@ def transform_orders(df: DataFrame) -> DataFrame: weekofyear=True ) + # Selecting attributes + df_orders_prep = df_orders_prep.selectExpr( + "order_id", + "customer_id", + "order_status", + "order_approved_at", + "order_deliv_carrier_dt", + "order_deliv_customer_dt", + "order_estim_deliv_dt", + "order_purchase_ts", + "year_order_purchase_ts", + "quarter_order_purchase_ts", + "month_order_purchase_ts", + "dayofmonth_order_purchase_ts", + "dayofweek_order_purchase_ts", + "dayofyear_order_purchase_ts", + "weekofyear_order_purchase_ts" + ) + return df_orders_prep except Exception as e: diff --git a/app/tests/__init__.py b/app/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/tests/configs/expected_schemas.json b/app/tests/configs/expected_schemas.json new file mode 100644 index 0000000..48955ff --- /dev/null +++ b/app/tests/configs/expected_schemas.json @@ -0,0 +1,14 @@ +{ + "expected": [ + { + "dataframe_reference": "df_orders_prep", + "schema": [ + { + "attribute": "idx", + "dtype": "int", + "nullable": false + } + ] + } + ] +} \ No newline at end of file diff --git a/app/tests/configs/source_schemas.json b/app/tests/configs/source_schemas.json new file mode 100644 index 0000000..02d5eff --- /dev/null +++ b/app/tests/configs/source_schemas.json @@ -0,0 +1,69 @@ +{ + "source": [ + { + "name": "tbl_brecommerce_orders", + "dataframe_reference": "df_orders", + "schema": [ + { + "attribute": "idx", + "dtype": "int", + "nullable": true + }, + { + "attribute": "order_id", + "dtype": "string", + "nullable": true + }, + { + "attribute": "customer_id", + "dtype": "string", + "nullable": true + }, + { + "attribute": "order_status", + "dtype": "string", + "nullable": true + }, + { + "attribute": "order_purchase_ts", + "dtype": "string", + "nullable": true + }, + { + "attribute": "order_approved_at", + "dtype": "string", + "nullable": true + }, + { + "attribute": "order_deliv_carrier_dt", + "dtype": "string", + "nullable": true + }, + { + "attribute": "order_deliv_customer_dt", + "dtype": "string", + "nullable": true + }, + { + "attribute": "order_estim_deliv_dt", + "dtype": "string", + "nullable": true + } + ], + "empty": false, + "fake_data": false, + "data": [ + [1, "e481f51cbdc54678b7cc49136f2d6af7", "9ef432eb6251297304e76186b10a928d", "delivered", "02/10/2017 10:56", "02/10/2017 11:07", "04/10/2017 19:55", "10/10/2017 21:25", "18/10/2017 00:00"], + [2, "53cdb2fc8bc7dce0b6741e2150273451", "b0830fb4747a6c6d20dea0b8c802d7ef", "delivered", "24/07/2018 20:41", "26/07/2018 03:24", "26/07/2018 14:31", "07/08/2018 15:27", "13/08/2018 00:00"], + [3, "47770eb9100c2d0c44946d9cf07ec65d", "41ce2a54c0b03bf3443c3d931a367089", "delivered", "08/08/2018 08:38", "08/08/2018 08:55", "08/08/2018 13:50", "17/08/2018 18:06", "04/09/2018 00:00"], + [4, "949d5b44dbf5de918fe9c16f97b45f8a", "f88197465ea7920adcdbec7375364d82", "delivered", "18/11/2017 19:28", "18/11/2017 19:45", "22/11/2017 13:39", "02/12/2017 00:28", "15/12/2017 00:00"], + [5, "ad21c59c0840e6cb83a9ceb5573f8159", "8ab97904e6daea8866dbdbc4fb7aad2c", "delivered", "13/02/2018 21:18", "13/02/2018 22:20", "14/02/2018 19:46", 
"16/02/2018 18:17", "26/02/2018 00:00"], + [6, "a4591c265e18cb1dcee52889e2d8acc3", "503740e9ca751ccdda7ba28e9ab8f608", "delivered", "09/07/2017 21:57", "09/07/2017 22:10", "11/07/2017 14:58", "26/07/2017 10:57", "01/08/2017 00:00"], + [7, "136cce7faa42fdb2cefd53fdc79a6098", "ed0271e0b7da060a393796590e7b737a", "invoiced", "11/04/2017 12:22", "13/04/2017 13:25", "09/05/2017 00:00", "", ""], + [8, "6514b8ad8028c9f2cc2374ded245783f", "9bdf08b4b3b52b5526ff42d37d47f222", "delivered", "16/05/2017 13:10", "16/05/2017 13:22", "22/05/2017 10:07", "26/05/2017 12:55", "07/06/2017 00:00"], + [9, "76c6e866289321a7c93b82b54852dc33", "f54a9f0e6b351c431402b8461ea51999", "delivered", "23/01/2017 18:29", "25/01/2017 02:50", "26/01/2017 14:16", "02/02/2017 14:08", "06/03/2017 00:00"], + [10, "e69bfb5eb88e0ed6a785585b27e16dbf", "31ad1d1b63eb9962463f764d4e6e0c9d", "delivered", "29/07/2017 11:55", "29/07/2017 12:05", "10/08/2017 19:45", "16/08/2017 17:14", "23/08/2017 00:00"] + ] + } + ] +} \ No newline at end of file diff --git a/app/tests/conftest.py b/app/tests/conftest.py new file mode 100644 index 0000000..61857f9 --- /dev/null +++ b/app/tests/conftest.py @@ -0,0 +1,57 @@ +"""Confest file for managing pytest fixtures and other components. + +This file will handle essential components and elements to be used on test +scripts along the project, like features and other things. + +___ +""" + +# Importing libraries +import pytest +import os +import json + +from pyspark.sql import SparkSession, DataFrame + +from tests.helpers.dataframes import create_spark_dataframe_from_json_info + +# from src.transformers import transform_orders + + +# Creating a SparkSession object +spark = SparkSession.builder.getOrCreate() + +# Defining paths for JSON files with infos to create Spark DataFrames +SOURCE_JSON_SCHEMAS_PATH = os.path.join( + os.getcwd(), "app/tests/configs/source_schemas.json" +) + + +# A JSON file loaded with source schema definition +@pytest.fixture() +def json_data_info(): + with open(SOURCE_JSON_SCHEMAS_PATH, "r") as f: + return json.load(f)["source"] + + +# A dictionary with all source DataFrames to be used on the Glue job +@pytest.fixture() +def source_dataframes_dict() -> dict: + return create_spark_dataframe_from_json_info( + json_path=SOURCE_JSON_SCHEMAS_PATH, + spark=spark + ) + + +# A df_orders sample DataFrame +@pytest.fixture() +def df_orders(source_dataframes_dict: dict) -> DataFrame: + return source_dataframes_dict["df_orders"] + + +# A df_orders_prep generated running the transform_orders function +""" +@pytest.fixture() +def df_orders_prep(df_orders) -> DataFrame: + return transform_orders(df=df_orders) +""" diff --git a/app/tests/helpers/__init__.py b/app/tests/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/tests/helpers/dataframes.py b/app/tests/helpers/dataframes.py new file mode 100644 index 0000000..41d9f01 --- /dev/null +++ b/app/tests/helpers/dataframes.py @@ -0,0 +1,185 @@ +"""Helps users to create Spark DataFrames to be used on unit tests. + +This Python file handles useful functions that can be used to create Spark +DataFrames based on JSON files containing definitions about source DataFrames +and expected DataFrames from transformation methods. + +The JSON files must be configured by users and stored on configs/ folder. This +module then defines functions to read those JSON files and return Spark +DataFrames based on how users configured schema information on the files. 
+
+___
+"""
+
+# Importing libraries
+import json
+
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType, StructField, StringType,\
+    IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType
+
+
+# Getting the active SparkSession
+spark = SparkSession.builder.getOrCreate()
+
+
+# Parsing a string for a dtype into a valid Spark dtype
+def parse_string_to_spark_dtype(dtype: str):
+    """Transforms a string dtype reference into a valid Spark dtype.
+
+    This function checks the data type reference given by users for a field
+    while filling the JSON schema file in order to return a valid Spark dtype
+    based on the string reference.
+
+    Example:
+    ```python
+    # Returning the Spark reference for a "string" data type
+    spark_dtype = parse_string_to_spark_dtype(dtype="string")
+    # spark_dtype now holds the StringType Spark dtype object
+    ```
+
+    Args:
+        dtype (str): A string reference for any parseable Spark dtype
+
+    Returns:
+        A callable Spark dtype object based on the string reference provided
+    """
+
+    # Removing noise on string before validating
+    dtype_prep = dtype.lower().strip()
+
+    # Parsing string reference for dtype to spark data type
+    if dtype_prep == "string":
+        return StringType
+    elif dtype_prep in ("int", "integer"):
+        return IntegerType
+    elif dtype_prep == "decimal":
+        return DecimalType
+    elif dtype_prep == "float":
+        return FloatType
+    elif dtype_prep == "date":
+        return DateType
+    elif dtype_prep == "timestamp":
+        return TimestampType
+    elif dtype_prep == "boolean":
+        return BooleanType
+    else:
+        raise TypeError(f"Data type {dtype} is not valid or currently "
+                        "parseable into a native Spark dtype")
+
+
+# Creating a valid Spark DataFrame schema from a list with fields information
+def create_spark_schema_from_schema_info(schema_info: list) -> StructType:
+    """Generates a StructType Spark schema based on a list of fields info.
+
+    This function receives a preconfigured Python list extracted from a JSON
+    schema definition file provided by users in order to return a valid Spark
+    schema composed by a StructType structure with multiple StructField objects
+    containing information about the name, data type and nullability of each
+    attribute.
+
+    Example:
+    ```python
+    # Showing an example of an input schema list
+    schema_list = [
+        {
+            "attribute": "idx",
+            "dtype": "int",
+            "nullable": True
+        },
+        {
+            "attribute": "order_id",
+            "dtype": "string",
+            "nullable": True
+        }
+    ]
+
+    # Returning a valid Spark schema object based on the list above
+    schema = create_spark_schema_from_schema_info(schema_list)
+    ```
+
+    Args:
+        schema_info (list): A list with information about fields of a DataFrame
+
+    Returns:
+        A StructType object structured in such a way that makes it possible to\
+        create a Spark DataFrame with a predefined schema.
+    """
+
+    # Extracting the schema based on the preconfigured dict info
+    schema = StructType([
+        StructField(
+            field_info["attribute"],
+            parse_string_to_spark_dtype(field_info["dtype"])(),
+            nullable=field_info["nullable"]
+        ) for field_info in schema_info
+    ])
+
+    return schema
+
+
+# Creating a dictionary with DataFrames to mock all sources
+def create_spark_dataframe_from_json_info(
+    json_path: str,
+    spark: SparkSession = spark,
+) -> dict:
+    """Creates a dictionary of Spark DataFrames based on inputs on a JSON file.
+
+    This function receives the path for a user defined JSON file containing
+    all information needed to specify all the sources to be on the Glue job
+    deployed and also tested on the pipeline in order to return a dictionary
+    of Spark DataFrames based on configs provided by users on the JSON file.
+
+    Example:
+    ```python
+    # Defining the path for the JSON file that defines all source data
+    json_path = "../configs/source_schemas.json"
+
+    # Getting a dictionary of Spark DataFrames based on user configs
+    source_dataframes = create_spark_dataframe_from_json_info(json_path)
+    ```
+
+    Args:
+        json_path (str):
+            The path for the JSON file provided by users with all information
+            needed to create Spark DataFrames for all source data for the job
+
+        spark (pyspark.sql.SparkSession):
+            A SparkSession object to call Spark methods
+
+    Returns:
+        A Python dictionary composed by multiple DataFrame objects based on\
+        inputs provided by users on the JSON file.
+    """
+
+    # Reading JSON file with all schemas definition
+    with open(json_path, "r") as f:
+        json_data_info = json.load(f)["source"]
+
+    # Creating an empty dict to store all source DataFrames
+    sources_dataframes = {}
+
+    # Iterating over all source schemas in order to create Spark DataFrames
+    for source_data in json_data_info:
+        # Returning a valid Spark DataFrame schema
+        schema = create_spark_schema_from_schema_info(source_data["schema"])
+
+        # Checking if users want to create an empty DataFrame
+        if source_data["empty"]:
+            # Creating a list of empty tuples to fill DataFrame with null data
+            data = [tuple([None] * len(source_data["schema"]))]
+        else:
+            # Checking if users want to fill DataFrames with fake data
+            if source_data["fake_data"]:
+                raise NotImplementedError("ToDo: fake data based on dtype using Faker")
+            else:
+                # Using data provided by users in the JSON file
+                data = [tuple(row) for row in source_data["data"]]
+
+        # Creating a Spark DataFrame and adding a new entry on dictionary
+        df_reference = source_data["dataframe_reference"]
+        sources_dataframes[df_reference] = spark.createDataFrame(
+            data=data, schema=schema
+        )
+
+    return sources_dataframes
diff --git a/app/tests/samples/__init__.py b/app/tests/samples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/tests/samples/data.py b/app/tests/samples/data.py
new file mode 100644
index 0000000..7002d50
--- /dev/null
+++ b/app/tests/samples/data.py
@@ -0,0 +1,40 @@
+"""Provides sample data used to build Spark DataFrames on test cases.
+
+This module is mainly used by a conftest file for creating fixtures with
+sample Spark DataFrames. The idea behind this script file is to provide data
+to be used to test features from other library modules.
+ +___ +""" + +# Importing libraries +from pyspark.sql.types import StringType, IntegerType + +# Creating a dicionary with all infos needed to create sample DataFrames +SAMPLES_DICT = { + "tbl_brecommerce_orders": { + "schema": { + "idx": IntegerType, + "order_id": StringType, + "customer_id": StringType, + "order_status": StringType, + "order_purchase_ts": StringType, + "order_approved_at": StringType, + "order_deliv_carrier_dt": StringType, + "order_deliv_customer_dt": StringType, + "order_estim_deliv_dt": StringType + }, + "data": [ + (1, "e481f51cbdc54678b7cc49136f2d6af7", "9ef432eb6251297304e76186b10a928d", "delivered", "02/10/2017 10:56", "02/10/2017 11:07", "04/10/2017 19:55", "10/10/2017 21:25", "18/10/2017 00:00"), + (2, "53cdb2fc8bc7dce0b6741e2150273451", "b0830fb4747a6c6d20dea0b8c802d7ef", "delivered", "24/07/2018 20:41", "26/07/2018 03:24", "26/07/2018 14:31", "07/08/2018 15:27", "13/08/2018 00:00"), + (3, "47770eb9100c2d0c44946d9cf07ec65d", "41ce2a54c0b03bf3443c3d931a367089", "delivered", "08/08/2018 08:38", "08/08/2018 08:55", "08/08/2018 13:50", "17/08/2018 18:06", "04/09/2018 00:00"), + (4, "949d5b44dbf5de918fe9c16f97b45f8a", "f88197465ea7920adcdbec7375364d82", "delivered", "18/11/2017 19:28", "18/11/2017 19:45", "22/11/2017 13:39", "02/12/2017 00:28", "15/12/2017 00:00"), + (5, "ad21c59c0840e6cb83a9ceb5573f8159", "8ab97904e6daea8866dbdbc4fb7aad2c", "delivered", "13/02/2018 21:18", "13/02/2018 22:20", "14/02/2018 19:46", "16/02/2018 18:17", "26/02/2018 00:00"), + (6, "a4591c265e18cb1dcee52889e2d8acc3", "503740e9ca751ccdda7ba28e9ab8f608", "delivered", "09/07/2017 21:57", "09/07/2017 22:10", "11/07/2017 14:58", "26/07/2017 10:57", "01/08/2017 00:00"), + (7, "136cce7faa42fdb2cefd53fdc79a6098", "ed0271e0b7da060a393796590e7b737a", "invoiced", "11/04/2017 12:22", "13/04/2017 13:25", "09/05/2017 00:00", "", ""), + (8, "6514b8ad8028c9f2cc2374ded245783f", "9bdf08b4b3b52b5526ff42d37d47f222", "delivered", "16/05/2017 13:10", "16/05/2017 13:22", "22/05/2017 10:07", "26/05/2017 12:55", "07/06/2017 00:00"), + (9, "76c6e866289321a7c93b82b54852dc33", "f54a9f0e6b351c431402b8461ea51999", "delivered", "23/01/2017 18:29", "25/01/2017 02:50", "26/01/2017 14:16", "02/02/2017 14:08", "06/03/2017 00:00"), + (10, "e69bfb5eb88e0ed6a785585b27e16dbf", "31ad1d1b63eb9962463f764d4e6e0c9d", "delivered", "29/07/2017 11:55", "29/07/2017 12:05", "10/08/2017 19:45", "16/08/2017 17:14", "23/08/2017 00:00") + ] + } +} diff --git a/app/tests/samples/dataframes.py b/app/tests/samples/dataframes.py new file mode 100644 index 0000000..8972ac3 --- /dev/null +++ b/app/tests/samples/dataframes.py @@ -0,0 +1,64 @@ +"""Helps users to generate their own sample Spark DataFrames. + +This Python file enables users to call functions specially created to make +the process of generating Spark DataFrames easier. It uses a user defined +Python dictionary SAMPLES_DICT on samples.data Python file to iterate over +all information set in order to create a dictionary of Spark DataFrames +objects based on user samples. 
+
+___
+"""
+
+# Importing libraries
+from samples.data import SAMPLES_DICT
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType, StructField
+
+
+# Creating or getting a SparkSession object
+spark = SparkSession.builder.getOrCreate()
+
+
+# Defining a function to generate mocked DataFrames based on an info dict
+def generate_samples_dataframes(
+    spark: SparkSession = spark,
+    samples_dict: dict = SAMPLES_DICT
+) -> dict:
+    """Creates a dictionary of sample Spark DataFrames from a predefined dict.
+
+    This function uses a user defined Python dictionary with all information
+    needed to create sample Spark DataFrames to be used as fixtures in the
+    conftest file. The main goal is to enable users to use sample DataFrames
+    to build custom unit tests.
+
+    Examples:
+    ```python
+    # Generating a dictionary of sample DataFrames
+    samples_dfs = generate_samples_dataframes(samples_dict=SAMPLES_DICT)
+    ```
+    """
+
+    # Creating a Python dictionary to store the sample DataFrames
+    samples_dfs_dict = {}
+
+    # Iterating over all tables sampled in samples_dict dictionary
+    for tbl_name, tbl_info in samples_dict.items():
+        # Extracting the schema in a way that is readable for Spark
+        sample_schema = StructType([
+            StructField(col, dtype(), nullable=True)
+            for col, dtype in tbl_info["schema"].items()
+        ])
+
+        # Extracting sample data content for the table
+        sample_data = tbl_info["data"]
+
+        # Creating a Spark DataFrame with sample data
+        df_sample = spark.createDataFrame(
+            data=sample_data,
+            schema=sample_schema
+        )
+
+        # Including the sample DataFrame on the final dictionary
+        samples_dfs_dict[tbl_name] = df_sample
+
+    return samples_dfs_dict
diff --git a/app/tests/test_helpers.py b/app/tests/test_helpers.py
new file mode 100644
index 0000000..09a5836
--- /dev/null
+++ b/app/tests/test_helpers.py
@@ -0,0 +1,198 @@
+"""Test cases for auxiliary modules located on the helpers folder.
+
+This file handles all unit tests to check if modules on the helpers folder are
+working properly in order to provide useful code to help users create
+their own Spark DataFrames to be used on fixtures and test cases.
+
+___
+"""
+
+# Importing libraries
+import pytest
+
+from tests.helpers.dataframes import parse_string_to_spark_dtype,\
+    create_spark_schema_from_schema_info
+
+from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType, StringType, IntegerType,\
+    DecimalType, FloatType, DateType, TimestampType, BooleanType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_string_reference_is_parsed_to_spark_stringtype():
+    """
+    G: given that users want to parse a "string" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="string" argument
+    T: then the return object must be a StringType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="string") is StringType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_int_reference_is_parsed_to_spark_integertype():
+    """
+    G: given that users want to parse an "int" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="int" argument
+    T: then the return object must be an IntegerType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="int") is IntegerType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_integer_reference_is_parsed_to_spark_integertype():
+    """
+    G: given that users want to parse an "integer" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="integer" argument
+    T: then the return object must be an IntegerType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="integer") is IntegerType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_decimal_reference_is_parsed_to_spark_decimaltype():
+    """
+    G: given that users want to parse a "decimal" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="decimal" argument
+    T: then the return object must be a DecimalType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="decimal") is DecimalType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_float_reference_is_parsed_to_spark_floattype():
+    """
+    G: given that users want to parse a "float" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="float" argument
+    T: then the return object must be a FloatType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="float") is FloatType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_date_reference_is_parsed_to_spark_datetype():
+    """
+    G: given that users want to parse a "date" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="date" argument
+    T: then the return object must be a DateType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="date") is DateType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_timestamp_reference_is_parsed_to_spark_timestamptype():
+    """
+    G: given that users want to parse a "timestamp" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="timestamp" argument
+    T: then the return object must be a TimestampType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="timestamp") is TimestampType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+def test_boolean_reference_is_parsed_to_spark_booleantype():
+    """
+    G: given that users want to parse a "boolean" reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with
+        dtype="boolean" argument
+    T: then the return object must be a BooleanType Spark object
+    """
+
+    assert parse_string_to_spark_dtype(dtype="boolean") is BooleanType
+
+
+@pytest.mark.dataframes
+@pytest.mark.parse_string_to_spark_dtype
+@pytest.mark.exception
+def test_typeerror_exception_when_passing_an_incorrect_dtype_string_reference():
+    """
+    G: given that users want to parse any string reference to a Spark dtype
+    W: when the function parse_string_to_spark_dtype() is called with an
+        invalid dtype argument (such as "foo")
+    T: then a TypeError must be thrown
+    """
+
+    with pytest.raises(TypeError):
+        _ = parse_string_to_spark_dtype(dtype="foo")
+
+
+@pytest.mark.dataframes
+@pytest.mark.create_spark_schema_from_schema_info
+def test_spark_schema_generated_by_function_is_a_structtype_object(
+    json_data_info
+):
+    """
+    G: given that users want to generate a valid Spark schema based on infos
+        put in a preconfigured JSON file
+    W: when the function create_spark_schema_from_schema_info() is called with
+        the schema info list extracted from the preconfigured JSON file
+    T: then the return must be a StructType object representing a Spark schema
+    """
+
+    # Getting the first element for the JSON file
+    sample_source_info = json_data_info[0]
+
+    # Getting a Spark schema from schema info extracted from JSON file
+    schema = create_spark_schema_from_schema_info(
+        schema_info=sample_source_info["schema"]
+    )
+
+    # Checking if returned schema is a StructType object
+    assert type(schema) is StructType
+
+
+@pytest.mark.dataframes
+@pytest.mark.create_spark_dataframe_from_json_info
+def test_function_to_create_spark_dataframes_returns_a_dictionary(
+    source_dataframes_dict
+):
+    """
+    G: given that users want to generate Spark DataFrames based on a
+        preconfigured JSON file in a specific format
+    W: when the function create_spark_dataframe_from_json_info() is called
+        with a path for reading the preconfigured JSON file
+    T: then the return object must be a Python dictionary
+    """
+
+    assert type(source_dataframes_dict) is dict
+
+
+@pytest.mark.dataframes
+@pytest.mark.create_spark_dataframe_from_json_info
+def test_dataframes_dict_has_spark_dataframes_as_dictionary_values(
+    source_dataframes_dict
+):
+    """
+    G: given that users want to generate Spark DataFrames based on a
+        preconfigured JSON file in a specific format
+    W: when the function create_spark_dataframe_from_json_info() is called
+        with a path for reading the preconfigured JSON file
+    T: then the value of any arbitrary key of the returned dictionary must
+        be a Spark DataFrame object
+    """
+
+    # Getting any arbitrary key from the dictionary
+    dict_key = list(source_dataframes_dict.keys())[0]
+
+    assert type(source_dataframes_dict[dict_key]) is DataFrame
diff --git a/app/tests/test_transformers.py b/app/tests/test_transformers.py
new file mode 100644
index 0000000..0b58114
--- /dev/null
+++ b/app/tests/test_transformers.py
@@ -0,0 +1,26 @@
+"""Test cases for transformation functions in the transformers.py module.
+
+This file handles all unit tests to check if the transformations coded on
+the transformers.py src module are working properly. The idea is to ensure
+that transformations are generating the expected result based on the output
+DataFrame schemas.
+ +___ +""" + +# Importing libraries +import pytest + + +@pytest.mark.transformers +@pytest.mark.skip(reason="Work in progress") +def test_df_orders_transformation_generates_the_expected_dataframe_schema( + df_orders_prep +): + """ + G: + W: + T: + """ + + ... diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..84e7243 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,9 @@ +[pytest] +markers = + exception: Unit tests when an exception is raised + helpers: Unit tests for all modules located on helpers test folder + dataframes: Unit tests related to the generation of DataFrames samples + parse_string_to_spark_dtype: Unit tests for function parse_string_to_spark_dtype() on dataframes.py module + create_spark_schema_from_schema_info: Unit tests for function create_spark_schema_from_schema_info() on dataframes.py module + create_spark_dataframe_from_json_info: Unit tests for function create_spark_dataframe_from_json_info() on dataframes.py module + transformers: Unit test for transformation functions located on the transformers.py application script \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index c17c288..15b0384 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,2 +1,6 @@ flake8 -sparksnake \ No newline at end of file +sparksnake +pytest +pytest-cov +Faker +moto \ No newline at end of file
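The fake-data branch of `create_spark_dataframe_from_json_info()` is still a ToDo, and Faker is added to `requirements/dev.txt` without being wired in yet. Below is a minimal sketch of what such a helper could look like, assuming it maps the same dtype strings accepted by `parse_string_to_spark_dtype()` to Faker providers; the function name `generate_fake_data_from_schema` and its row-count parameter are illustrative, not part of this diff.

```python
# Hypothetical helper for the fake_data branch left as a ToDo in
# app/tests/helpers/dataframes.py. Names and defaults are illustrative only.
from faker import Faker

faker = Faker()


def generate_fake_data_from_schema(schema_info: list, n_rows: int = 5) -> list:
    """Builds a list of tuples with fake values based on the JSON schema info."""

    # Mapping the dtype strings accepted by parse_string_to_spark_dtype()
    # to Faker providers returning Python objects Spark can ingest
    fake_value_by_dtype = {
        "string": faker.word,
        "int": faker.pyint,
        "integer": faker.pyint,
        "decimal": lambda: faker.pydecimal(left_digits=10, right_digits=0),
        "float": faker.pyfloat,
        "date": faker.date_object,
        "timestamp": faker.date_time,
        "boolean": faker.pybool
    }

    # Creating one tuple per row following the attribute order of the schema
    return [
        tuple(
            fake_value_by_dtype[field["dtype"].lower().strip()]()
            for field in schema_info
        )
        for _ in range(n_rows)
    ]
```

With a helper like this, the `fake_data` branch could simply assign `data = generate_fake_data_from_schema(source_data["schema"])` before the DataFrame is created.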
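test_transformers.py lands as a skipped placeholder and the `df_orders_prep` fixture stays commented out in conftest.py until `transform_orders` can be imported by the test suite. One possible shape for that pending test is sketched below; it assumes the fixture gets enabled, that `configs/expected_schemas.json` is filled with the real output schema (today it only carries a placeholder `idx` attribute), and that pytest runs from the repository root like the CI step. The `EXPECTED_JSON_SCHEMAS_PATH` constant is illustrative.

```python
# Hypothetical completion of the skipped test in app/tests/test_transformers.py
import json
import os

import pytest

from tests.helpers.dataframes import create_spark_schema_from_schema_info

# Assumed path constant, mirroring SOURCE_JSON_SCHEMAS_PATH in conftest.py
EXPECTED_JSON_SCHEMAS_PATH = os.path.join(
    os.getcwd(), "app/tests/configs/expected_schemas.json"
)


@pytest.mark.transformers
def test_df_orders_transformation_generates_the_expected_dataframe_schema(
    df_orders_prep
):
    """
    G: given that users want to validate the transform_orders() output
    W: when the function is applied on the df_orders source DataFrame
    T: then the resulting schema must match the one configured on the
        expected_schemas.json file
    """

    # Reading the expected schema definition for the df_orders_prep DataFrame
    with open(EXPECTED_JSON_SCHEMAS_PATH, "r") as f:
        expected_info = json.load(f)["expected"]

    expected_schema_info = [
        entry["schema"] for entry in expected_info
        if entry["dataframe_reference"] == "df_orders_prep"
    ][0]

    # Building a StructType from the JSON definition; comparing field names
    # keeps the check robust to nullability differences after transformations
    expected_schema = create_spark_schema_from_schema_info(expected_schema_info)

    assert df_orders_prep.schema.fieldNames() == expected_schema.fieldNames()
```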
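The `empty` flag supported by the JSON config (a single all-null row) has no test coverage in this diff. A possible test for that branch is sketched below; the temporary JSON written with pytest's `tmp_path` fixture and the table/DataFrame names are made up for the example and only mirror the configs format.

```python
# Hypothetical test for the "empty" branch of create_spark_dataframe_from_json_info()
import json

import pytest

from tests.helpers.dataframes import create_spark_dataframe_from_json_info


@pytest.mark.dataframes
@pytest.mark.create_spark_dataframe_from_json_info
def test_empty_flag_creates_a_single_row_dataframe_with_null_values(tmp_path):
    """
    G: given a source entry configured with the "empty" flag set to true
    W: when create_spark_dataframe_from_json_info() is called for that file
    T: then the resulting DataFrame must have one row filled with null values
    """

    # Writing a minimal source definition with the empty flag enabled
    source_config = {
        "source": [
            {
                "name": "tbl_empty_sample",
                "dataframe_reference": "df_empty",
                "schema": [
                    {"attribute": "idx", "dtype": "int", "nullable": True},
                    {"attribute": "order_id", "dtype": "string", "nullable": True}
                ],
                "empty": True,
                "fake_data": False,
                "data": []
            }
        ]
    }
    json_path = tmp_path / "empty_schemas.json"
    json_path.write_text(json.dumps(source_config))

    # Building the DataFrame dictionary from the temporary JSON file
    df_empty = create_spark_dataframe_from_json_info(str(json_path))["df_empty"]

    assert df_empty.count() == 1
    assert df_empty.collect()[0]["order_id"] is None
```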
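samples/dataframes.py is not referenced by conftest.py in this diff. If it gets wired in, a fixture could look like the sketch below; note that the module imports `samples.data` while conftest.py imports via the `tests.` package, so the import path here assumes the module resolves under `tests.samples` (its own `from samples.data import SAMPLES_DICT` line may need the same adjustment). Fixture names are illustrative.

```python
# Hypothetical conftest.py additions exposing the sample DataFrames built by
# generate_samples_dataframes(); fixture names are illustrative only.
import pytest

# Assumes the samples package is importable under the tests package,
# mirroring the tests.helpers import style used in conftest.py
from tests.samples.dataframes import generate_samples_dataframes


@pytest.fixture()
def samples_dataframes_dict() -> dict:
    return generate_samples_dataframes()


@pytest.fixture()
def df_orders_sample(samples_dataframes_dict: dict):
    # The key follows the table name used on SAMPLES_DICT in samples/data.py
    return samples_dataframes_dict["tbl_brecommerce_orders"]
```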