Commit
test: including test cases for helpers functions (#69)
ThiagoPanini committed Apr 29, 2023
1 parent bc9a452 commit 64a9447
Showing 6 changed files with 93 additions and 29 deletions.
2 changes: 1 addition & 1 deletion app/tests/configs/source_schemas.json
@@ -50,7 +50,7 @@
"nullable": true
}
],
"empty": true,
"empty": false,
"fake_data": false,
"data": [
[1, "e481f51cbdc54678b7cc49136f2d6af7", "9ef432eb6251297304e76186b10a928d", "delivered", "02/10/2017 10:56", "02/10/2017 11:07", "04/10/2017 19:55", "10/10/2017 21:25", "18/10/2017 00:00"],
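For context, a minimal sketch of the shape this config file appears to have, inferred from the keys visible in this diff and in the helper functions below ("source", "schema" entries with attribute/dtype/nullable, "empty", "fake_data", "data"); the "name" key and all values here are hypothetical:

```json
{
  "source": [
    {
      "name": "orders",
      "schema": [
        {"attribute": "order_id", "dtype": "string", "nullable": true},
        {"attribute": "order_status", "dtype": "string", "nullable": true}
      ],
      "empty": false,
      "fake_data": false,
      "data": [
        ["e481f51cbdc54678b7cc49136f2d6af7", "delivered"]
      ]
    }
  ]
}
```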
14 changes: 11 additions & 3 deletions app/tests/conftest.py
@@ -9,10 +9,11 @@
# Importing libraries
import pytest
import os
+import json

from pyspark.sql import SparkSession, DataFrame

-from tests.helpers.dataframes import create_source_dataframes
+from tests.helpers.dataframes import create_spark_dataframe_from_json_info

# from src.transformers import transform_orders

@@ -26,11 +27,18 @@
)


+# A JSON file loaded with source schema definition
+@pytest.fixture()
+def json_data_info():
+    with open(SOURCE_JSON_SCHEMAS_PATH, "r") as f:
+        return json.load(f)["source"]


# A dictionary with all source DataFrames to be used on the Glue job
@pytest.fixture()
def source_dataframes_dict() -> dict:
-    return create_source_dataframes(
-        source_schemas_json_path=SOURCE_JSON_SCHEMAS_PATH,
+    return create_spark_dataframe_from_json_info(
+        json_path=SOURCE_JSON_SCHEMAS_PATH,
spark=spark
)

24 changes: 12 additions & 12 deletions app/tests/helpers/dataframes.py
@@ -16,7 +16,7 @@

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,\
-IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType
+    IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType


# Getting the active SparkSession
@@ -69,7 +69,7 @@ def parse_string_to_spark_dtype(dtype: str):


# Creating a valid Spark DataFrame schema from a list with fields information
-def create_spark_schema_from_dict(schema_list: list) -> StructType:
+def create_spark_schema_from_schema_info(schema_info: list) -> StructType:
"""Generates a StructType Spark schema based on a list of fields info.
This function receives a preconfigured Python list extracted from a JSON
@@ -95,11 +95,11 @@ def create_spark_schema_from_dict(schema_list: list) -> StructType:
]
# Returning a valid Spark schema object based on a dictionary
-    schema = create_spark_schema_from_dict(schema_list)
+    schema = create_spark_schema_from_schema_info(schema_info)
```
Args:
-        schema_list (list): A list with information about DataFrame fields
+        schema_info (list): A list with information about fields of a DataFrame
Returns:
A StructType object structured in such a way that makes it possible to\
@@ -112,18 +112,18 @@ def create_spark_schema_from_dict(schema_list: list) -> StructType:
            field_info["attribute"],
            parse_string_to_spark_dtype(field_info["dtype"])(),
            nullable=field_info["nullable"]
-        ) for field_info in schema_list
+        ) for field_info in schema_info
])

return schema


# Creating a dictionary with DataFrames to mock all sources
-def create_source_dataframes(
-    source_schemas_json_path: str,
+def create_spark_dataframe_from_json_info(
+    json_path: str,
spark: SparkSession = spark,
) -> dict:
"""Creates a dictionary of Spark DataFrames based on inputs on a JSON FILE.
"""Creates a dictionary of Spark DataFrames based on inputs on a JSON file.
This function receives the path for a user defined JSON file containing
all information needed to specify all the sources to be on the Glue job
@@ -136,11 +136,11 @@ def create_source_dataframes(
json_path = "../configs/source_schemas.json"
# Getting a dictionary of Spark DataFrames based on user configs
-    source_dataframes = create_source_dataframes(json_path)
+    source_dataframes = create_spark_dataframe_from_json_info(json_path)
```
Args:
-        source_schemas_json_path (str):
+        json_path (str):
The path for the JSON file provided by user with all information
needed to create Spark DataFrames for all source data for the job
@@ -153,7 +153,7 @@
"""

# Reading JSON file with all schemas definition
-    with open(source_schemas_json_path, "r") as f:
+    with open(json_path, "r") as f:
json_data_info = json.load(f)["source"]

# Creating an empty dict to store all source DataFrames
@@ -162,7 +162,7 @@
# Iterating over all source schemas in order to create Spark DataFrames
for source_data in json_data_info:
# Returning a valid Spark DataFrame schema
-        schema = create_spark_schema_from_dict(source_data["schema"])
+        schema = create_spark_schema_from_schema_info(source_data["schema"])

# Checking if users want to create an empty DataFrame
if source_data["empty"]:
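The diff view is collapsed right after this check. A plausible sketch of how the loop body could continue, repeating the visible if line for context and assuming each source entry carries a "name" key used to index the returned dictionary (that key is an assumption, not confirmed by this diff):

```python
        # Checking if users want to create an empty DataFrame
        if source_data["empty"]:
            df = spark.createDataFrame([], schema=schema)
        else:
            # Building the DataFrame from the rows under the "data" key
            df = spark.createDataFrame(source_data["data"], schema=schema)

        # Hypothetical "name" key; the real file may use another field
        source_dataframes[source_data["name"]] = df

    return source_dataframes
```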
78 changes: 66 additions & 12 deletions app/tests/test_helpers.py
@@ -11,14 +11,15 @@
import pytest

from tests.helpers.dataframes import parse_string_to_spark_dtype,\
-    create_spark_schema_from_dict, create_source_dataframes
+    create_spark_schema_from_schema_info

-from pyspark.sql.types import StructType, StructField, StringType,\
-    IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType
+from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType, StringType, IntegerType,\
+    DecimalType, FloatType, DateType, TimestampType, BooleanType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_string_reference_is_parsed_to_spark_stringtype():
"""
G: given that users want to parse a "string" reference to a Spark dtype
@@ -30,7 +31,6 @@ def test_string_reference_is_parsed_to_spark_stringtype():
assert parse_string_to_spark_dtype(dtype="string") is StringType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_int_reference_is_parsed_to_spark_integertype():
@@ -44,7 +44,6 @@ def test_int_reference_is_parsed_to_spark_integertype():
assert parse_string_to_spark_dtype(dtype="int") is IntegerType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_integer_reference_is_parsed_to_spark_integertype():
@@ -58,7 +57,6 @@ def test_integer_reference_is_parsed_to_spark_integertype():
assert parse_string_to_spark_dtype(dtype="integer") is IntegerType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_decimal_reference_is_parsed_to_spark_decimaltype():
@@ -72,7 +70,6 @@ def test_decimal_reference_is_parsed_to_spark_decimaltype():
assert parse_string_to_spark_dtype(dtype="decimal") is DecimalType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_float_reference_is_parsed_to_spark_floattype():
@@ -86,7 +83,6 @@ def test_float_reference_is_parsed_to_spark_floattype():
assert parse_string_to_spark_dtype(dtype="float") is FloatType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_date_reference_is_parsed_to_spark_datetype():
@@ -100,7 +96,6 @@ def test_date_reference_is_parsed_to_spark_datetype():
assert parse_string_to_spark_dtype(dtype="date") is DateType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_timestamp_reference_is_parsed_to_spark_timestamptype():
@@ -114,7 +109,6 @@ def test_timestamp_reference_is_parsed_to_spark_timestamptype():
assert parse_string_to_spark_dtype(dtype="timestamp") is TimestampType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_boolean_reference_is_parsed_to_spark_booleantype():
@@ -128,7 +122,6 @@ def test_boolean_reference_is_parsed_to_spark_booleantype():
assert parse_string_to_spark_dtype(dtype="boolean") is BooleanType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
@pytest.mark.exception
@@ -142,3 +135,64 @@ def test_typeerror_exception_when_passing_a_incorrect_dtype_string_reference():

with pytest.raises(TypeError):
_ = parse_string_to_spark_dtype(dtype="foo")


@pytest.mark.dataframes
@pytest.mark.create_spark_schema_from_schema_info
def test_spark_schema_generated_by_function_is_a_structtype_object(
json_data_info
):
"""
    G: given that users want to generate a valid Spark schema based on info
    put in a preconfigured JSON file
    W: when the function create_spark_schema_from_schema_info() is called with
    the schema info extracted from that JSON file
    T: then the return must be a StructType object representing a Spark schema
"""

    # Getting the first source entry from the JSON file
sample_source_info = json_data_info[0]

# Getting a Spark schema from schema info extracted from JSON file
schema = create_spark_schema_from_schema_info(
schema_info=sample_source_info["schema"]
)

# Checking if returned schema is a StructType object
assert type(schema) is StructType

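For readers unfamiliar with the fixtures, an equivalent direct call with an inline schema_info list (the attribute names here are illustrative, not taken from the real JSON file):

```python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from tests.helpers.dataframes import create_spark_schema_from_schema_info

# Illustrative field definitions in the same format the JSON file uses
schema_info = [
    {"attribute": "order_id", "dtype": "string", "nullable": True},
    {"attribute": "order_items", "dtype": "int", "nullable": True},
]

schema = create_spark_schema_from_schema_info(schema_info)

# Equivalent hand-built schema for comparison
expected = StructType([
    StructField("order_id", StringType(), nullable=True),
    StructField("order_items", IntegerType(), nullable=True),
])
assert schema == expected
```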

@pytest.mark.dataframes
@pytest.mark.create_spark_dataframe_from_json_info
def test_function_to_create_spark_dataframes_returns_a_dictionary(
source_dataframes_dict
):
"""
G: given that users want to generate Spark DataFrames based on a
preconfigured JSON file in a specific format
W: when the function create_spark_dataframe_from_json_info() is called
with a path for reading the preconfigured JSON file
T: then the return object must be a Python dictionary
"""

assert type(source_dataframes_dict) is dict


@pytest.mark.dataframes
@pytest.mark.create_spark_dataframe_from_json_info
def test_dataframes_dict_has_spark_dataframes_as_dictionary_values(
source_dataframes_dict
):
"""
G: given that users want to generate Spark DataFrames based on a
preconfigured JSON file in a specific format
W: when the function create_spark_dataframe_from_json_info() is called
with a path for reading the preconfigured JSON file
T: then the value of any arbitrary key of the returned dictionary must
be a Spark DataFrame object
"""

# Getting any arbitrary key from the dictionary
dict_key = list(source_dataframes_dict.keys())[0]

assert type(source_dataframes_dict[dict_key]) is DataFrame
2 changes: 1 addition & 1 deletion app/tests/test_transformers.py
@@ -22,5 +22,5 @@ def test_df_orders_transformation_generates_the_expected_dataframe_schema(
W:
T:
"""

+    ...
2 changes: 2 additions & 0 deletions pytest.ini
@@ -4,4 +4,6 @@ markers =
helpers: Unit tests for all modules located on helpers test folder
dataframes: Unit tests related to the generation of DataFrames samples
parse_string_to_spark_dtype: Unit tests for function parse_string_to_spark_dtype() on dataframes.py module
+create_spark_schema_from_schema_info: Unit tests for function create_spark_schema_from_schema_info() on dataframes.py module
+create_spark_dataframe_from_json_info: Unit tests for function create_spark_dataframe_from_json_info() on dataframes.py module
transformers: Unit test for transformation functions located on the transformers.py application script

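With these markers registered, subsets of the suite can be run in isolation; for example, `pytest -m create_spark_schema_from_schema_info` selects only the schema-creation tests, while `pytest -m dataframes` runs the whole DataFrame-helpers group.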