Commit
test: including test cases for helpers functions (#69)
ThiagoPanini committed Apr 29, 2023
1 parent bc9a452 commit 64a9447
Showing 6 changed files with 93 additions and 29 deletions.
2 changes: 1 addition & 1 deletion app/tests/configs/source_schemas.json
@@ -50,7 +50,7 @@
"nullable": true
}
],
"empty": true,
"empty": false,
"fake_data": false,
"data": [
[1, "e481f51cbdc54678b7cc49136f2d6af7", "9ef432eb6251297304e76186b10a928d", "delivered", "02/10/2017 10:56", "02/10/2017 11:07", "04/10/2017 19:55", "10/10/2017 21:25", "18/10/2017 00:00"],
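For context, a minimal sketch of the shape this config file appears to have, inferred from the keys visible in this diff and in the helper functions below ("source", "schema" entries with attribute/dtype/nullable, "empty", "fake_data", "data"); the "name" key and all values here are hypothetical:

```json
{
  "source": [
    {
      "name": "orders",
      "schema": [
        {"attribute": "order_id", "dtype": "string", "nullable": true},
        {"attribute": "order_status", "dtype": "string", "nullable": true}
      ],
      "empty": false,
      "fake_data": false,
      "data": [
        ["e481f51cbdc54678b7cc49136f2d6af7", "delivered"]
      ]
    }
  ]
}
```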
14 changes: 11 additions & 3 deletions app/tests/conftest.py
@@ -9,10 +9,11 @@
# Importing libraries
import pytest
import os
+import json

from pyspark.sql import SparkSession, DataFrame

-from tests.helpers.dataframes import create_source_dataframes
+from tests.helpers.dataframes import create_spark_dataframe_from_json_info

# from src.transformers import transform_orders

@@ -26,11 +27,18 @@
)


+# A JSON file loaded with source schema definition
+@pytest.fixture()
+def json_data_info():
+    with open(SOURCE_JSON_SCHEMAS_PATH, "r") as f:
+        return json.load(f)["source"]


# A dictionary with all source DataFrames to be used on the Glue job
@pytest.fixture()
def source_dataframes_dict() -> dict:
-    return create_source_dataframes(
-        source_schemas_json_path=SOURCE_JSON_SCHEMAS_PATH,
+    return create_spark_dataframe_from_json_info(
+        json_path=SOURCE_JSON_SCHEMAS_PATH,
spark=spark
)

24 changes: 12 additions & 12 deletions app/tests/helpers/dataframes.py
@@ -16,7 +16,7 @@

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,\
-IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType
+    IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType


# Getting the active SparkSession
@@ -69,7 +69,7 @@ def parse_string_to_spark_dtype(dtype: str):


# Creating a valid Spark DataFrame schema from a list with fields information
-def create_spark_schema_from_dict(schema_list: list) -> StructType:
+def create_spark_schema_from_schema_info(schema_info: list) -> StructType:
"""Generates a StructType Spark schema based on a list of fields info.
This function receives a preconfigured Python list extracted from a JSON
@@ -95,11 +95,11 @@ def create_spark_schema_from_dict(schema_list: list) -> StructType:
]
# Returning a valid Spark schema object based on a dictionary
-    schema = create_spark_schema_from_dict(schema_list)
+    schema = create_spark_schema_from_schema_info(schema_info)
```
Args:
-        schema_list (list): A list with information about DataFrame fields
+        schema_info (list): A list with information about fields of a DataFrame
Returns:
A StructType object structured in such a way that makes it possible to\
@@ -112,18 +112,18 @@ def create_spark_schema_from_dict(schema_list: list) -> StructType:
            field_info["attribute"],
            parse_string_to_spark_dtype(field_info["dtype"])(),
            nullable=field_info["nullable"]
-        ) for field_info in schema_list
+        ) for field_info in schema_info
])

return schema


# Creating a dictionary with DataFrames to mock all sources
-def create_source_dataframes(
-    source_schemas_json_path: str,
+def create_spark_dataframe_from_json_info(
+    json_path: str,
spark: SparkSession = spark,
) -> dict:
"""Creates a dictionary of Spark DataFrames based on inputs on a JSON FILE.
"""Creates a dictionary of Spark DataFrames based on inputs on a JSON file.
This function receives the path for a user defined JSON file containing
all information needed to specify all the sources to be on the Glue job
@@ -136,11 +136,11 @@ def create_source_dataframes(
json_path = "../configs/source_schemas.json"
# Getting a dictionary of Spark DataFrames based on user configs
-    source_dataframes = create_source_dataframes(json_path)
+    source_dataframes = create_spark_dataframe_from_json_info(json_path)
```
Args:
-        source_schemas_json_path (str):
+        json_path (str):
The path for the JSON file provided by user with all information
needed to create Spark DataFrames for all source data for the job
@@ -153,7 +153,7 @@
"""

# Reading JSON file with all schemas definition
-    with open(source_schemas_json_path, "r") as f:
+    with open(json_path, "r") as f:
json_data_info = json.load(f)["source"]

# Creating an empty dict to store all source DataFrames
@@ -162,7 +162,7 @@
# Iterating over all source schemas in order to create Spark DataFrames
for source_data in json_data_info:
# Returning a valid Spark DataFrame schema
-        schema = create_spark_schema_from_dict(source_data["schema"])
+        schema = create_spark_schema_from_schema_info(source_data["schema"])

# Checking if users want to create an empty DataFrame
if source_data["empty"]:
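The diff view is collapsed right after this check. A plausible sketch of how the loop body could continue, repeating the visible if line for context and assuming each source entry carries a "name" key used to index the returned dictionary (that key is an assumption, not confirmed by this diff):

```python
        # Checking if users want to create an empty DataFrame
        if source_data["empty"]:
            df = spark.createDataFrame([], schema=schema)
        else:
            # Building the DataFrame from the rows under the "data" key
            df = spark.createDataFrame(source_data["data"], schema=schema)

        # Hypothetical "name" key; the real file may use another field
        source_dataframes[source_data["name"]] = df

    return source_dataframes
```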
78 changes: 66 additions & 12 deletions app/tests/test_helpers.py
@@ -11,14 +11,15 @@
import pytest

from tests.helpers.dataframes import parse_string_to_spark_dtype,\
-    create_spark_schema_from_dict, create_source_dataframes
+    create_spark_schema_from_schema_info

-from pyspark.sql.types import StructType, StructField, StringType,\
-    IntegerType, DecimalType, FloatType, DateType, TimestampType, BooleanType
+from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType, StringType, IntegerType,\
+    DecimalType, FloatType, DateType, TimestampType, BooleanType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_string_reference_is_parsed_to_spark_stringtype():
"""
G: given that users want to parse a "string" reference to a Spark dtype
@@ -30,7 +31,6 @@ def test_string_reference_is_parsed_to_spark_stringtype():
assert parse_string_to_spark_dtype(dtype="string") is StringType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_int_reference_is_parsed_to_spark_integertype():
@@ -44,7 +44,6 @@ def test_int_reference_is_parsed_to_spark_integertype():
assert parse_string_to_spark_dtype(dtype="int") is IntegerType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_integer_reference_is_parsed_to_spark_integertype():
@@ -58,7 +57,6 @@ def test_integer_reference_is_parsed_to_spark_integertype():
assert parse_string_to_spark_dtype(dtype="integer") is IntegerType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_decimal_reference_is_parsed_to_spark_decimaltype():
@@ -72,7 +70,6 @@ def test_decimal_reference_is_parsed_to_spark_decimaltype():
assert parse_string_to_spark_dtype(dtype="decimal") is DecimalType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_float_reference_is_parsed_to_spark_floattype():
@@ -86,7 +83,6 @@ def test_float_reference_is_parsed_to_spark_floattype():
assert parse_string_to_spark_dtype(dtype="float") is FloatType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_date_reference_is_parsed_to_spark_datetype():
@@ -100,7 +96,6 @@ def test_date_reference_is_parsed_to_spark_datetype():
assert parse_string_to_spark_dtype(dtype="date") is DateType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_timestamp_reference_is_parsed_to_spark_timestamptype():
@@ -114,7 +109,6 @@ def test_timestamp_reference_is_parsed_to_spark_timestamptype():
assert parse_string_to_spark_dtype(dtype="timestamp") is TimestampType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
def test_boolean_reference_is_parsed_to_spark_booleantype():
@@ -128,7 +122,6 @@ def test_boolean_reference_is_parsed_to_spark_booleantype():
assert parse_string_to_spark_dtype(dtype="boolean") is BooleanType


-@pytest.mark.helpers
@pytest.mark.dataframes
@pytest.mark.parse_string_to_spark_dtype
@pytest.mark.exception
@@ -142,3 +135,64 @@ def test_typeerror_exception_when_passing_a_incorrect_dtype_string_reference():

with pytest.raises(TypeError):
_ = parse_string_to_spark_dtype(dtype="foo")


@pytest.mark.dataframes
@pytest.mark.create_spark_schema_from_schema_info
def test_spark_schema_generated_by_function_is_a_structtype_object(
json_data_info
):
"""
    G: given that users want to generate a valid Spark schema based on info
    put in a preconfigured JSON file
    W: when the function create_spark_schema_from_schema_info() is called with
    the schema info extracted from that JSON file
    T: then the return must be a StructType object representing a Spark schema
"""

    # Getting the first source entry from the JSON file
sample_source_info = json_data_info[0]

# Getting a Spark schema from schema info extracted from JSON file
schema = create_spark_schema_from_schema_info(
schema_info=sample_source_info["schema"]
)

# Checking if returned schema is a StructType object
assert type(schema) is StructType

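For readers unfamiliar with the fixtures, an equivalent direct call with an inline schema_info list (the attribute names here are illustrative, not taken from the real JSON file):

```python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from tests.helpers.dataframes import create_spark_schema_from_schema_info

# Illustrative field definitions in the same format the JSON file uses
schema_info = [
    {"attribute": "order_id", "dtype": "string", "nullable": True},
    {"attribute": "order_items", "dtype": "int", "nullable": True},
]

schema = create_spark_schema_from_schema_info(schema_info)

# Equivalent hand-built schema for comparison
expected = StructType([
    StructField("order_id", StringType(), nullable=True),
    StructField("order_items", IntegerType(), nullable=True),
])
assert schema == expected
```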

@pytest.mark.dataframes
@pytest.mark.create_spark_dataframe_from_json_info
def test_function_to_create_spark_dataframes_returns_a_dictionary(
source_dataframes_dict
):
"""
G: given that users want to generate Spark DataFrames based on a
preconfigured JSON file in a specific format
W: when the function create_spark_dataframe_from_json_info() is called
with a path for reading the preconfigured JSON file
T: then the return object must be a Python dictionary
"""

assert type(source_dataframes_dict) is dict


@pytest.mark.dataframes
@pytest.mark.create_spark_dataframe_from_json_info
def test_dataframes_dict_has_spark_dataframes_as_dictionary_values(
source_dataframes_dict
):
"""
G: given that users want to generate Spark DataFrames based on a
preconfigured JSON file in a specific format
W: when the function create_spark_dataframe_from_json_info() is called
with a path for reading the preconfigured JSON file
T: then the value of any arbitrary key of the returned dictionary must
be a Spark DataFrame object
"""

# Getting any arbitrary key from the dictionary
dict_key = list(source_dataframes_dict.keys())[0]

assert type(source_dataframes_dict[dict_key]) is DataFrame
2 changes: 1 addition & 1 deletion app/tests/test_transformers.py
@@ -22,5 +22,5 @@ def test_df_orders_transformation_generates_the_expected_dataframe_schema(
W:
T:
"""

+    ...
2 changes: 2 additions & 0 deletions pytest.ini
@@ -4,4 +4,6 @@ markers =
helpers: Unit tests for all modules located on helpers test folder
dataframes: Unit tests related to the generation of DataFrames samples
parse_string_to_spark_dtype: Unit tests for function parse_string_to_spark_dtype() on dataframes.py module
+create_spark_schema_from_schema_info: Unit tests for function create_spark_schema_from_schema_info() on dataframes.py module
+create_spark_dataframe_from_json_info: Unit tests for function create_spark_dataframe_from_json_info() on dataframes.py module
transformers: Unit test for transformation functions located on the transformers.py application script

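With these markers registered, subsets of the suite can be run in isolation; for example, `pytest -m create_spark_schema_from_schema_info` selects only the schema-creation tests, while `pytest -m dataframes` runs the whole DataFrame-helpers group.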