
Merge pull request #42 from ThiagoPanini/develop
pr(main): unit test refactoring
ThiagoPanini authored Jan 24, 2023
2 parents 6b11141 + 2666b9a commit f729574
Showing 16 changed files with 616 additions and 194 deletions.
Empty file added .github/codecov.yml
5 changes: 4 additions & 1 deletion .github/workflows/ci-terraglue.yml
@@ -4,6 +4,7 @@ on:
push:
branches:
- develop
- feature**
pull_request:
branches:
- develop
@@ -65,11 +66,13 @@ jobs:
-u root
-v ${{ github.workspace }}:/home/glue_user/workspace/terraglue
-e DISABLE_SSL=true
-e AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}
-e AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}
-e AWS_REGION=sa-east-1
--rm
-p 4040:4040 -p 18080:18080
--name terraglue
run: |
export AWS_REGION=$(AWS_REGION)
cd terraglue
python3 -m pip install --user --upgrade pip -r app/requirements-container.txt
python3 -m pytest app/ -vv --color=yes --cov=./ --cov-report=xml
4 changes: 4 additions & 0 deletions README.md
@@ -218,6 +218,10 @@ Everyone is very welcome to contribute improvements and new features
- [Eduardo Mendes - Live de Python #168 - Pytest Fixtures](https://www.youtube.com/watch?v=sidi9Z_IkLU&t)
- [Databricks - Data + AI Summit 2022 - Learn to Efficiently Test ETL Pipelines](https://www.youtube.com/watch?v=uzVewG8M6r0&t=1127s)
- [Real Python - Getting Started with Testing in Python](https://realpython.com/python-testing/)
- [Inspired Python - Five Advanced Pytest Fixture Patterns](https://www.inspiredpython.com/article/five-advanced-pytest-fixture-patterns)
- [getmoto/moto - mock inputs](https://github.com/getmoto/moto/blob/master/tests/test_glue/fixtures/datacatalog.py)
- [Codecov - Do test files belong in code coverage calculations?](https://about.codecov.io/blog/should-i-include-test-files-in-code-coverage-calculations/)
- [Jenkins Issue: Endpoint does not contain a valid host name](https://issues.jenkins.io/browse/JENKINS-63177)

**_Others_**
- [Differences between System of Record and Source of Truth](https://www.linkedin.com/pulse/difference-between-system-record-source-truth-santosh-kudva/)
2 changes: 2 additions & 0 deletions app/pytest.ini
@@ -7,6 +7,8 @@ markers =
terraform: tests related to user declarations in Terraform files used to create cloud resources
job_manager: tests related to the GlueJobManager class, responsible for providing everything needed to run a Glue job on AWS
etl_manager: tests related to the GlueETLManager class, responsible for providing data transformation methods and general operations using Glue
generate_dynamicframe_dict: tests related to the generate_dynamicframe_dict method of the GlueETLManager class
generate_dataframe_dict: tests related to the generate_dataframe_dict method of the GlueETLManager class
date_attributes_extraction: tests related to the date_attributes_extraction method of the GlueETLManager class
add_partition: tests related to the add_partition method of the GlueETLManager class
repartition_dataframe: tests related to the repartition_dataframe method of the GlueETLManager class
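
The markers above only take effect once test functions opt into them. A minimal sketch of how a test module might use one of these markers, assuming a hypothetical fake_dataframe fixture (the actual fixture names in app/tests are not shown in this diff):

import pytest


@pytest.mark.add_partition
def test_add_partition_keeps_rows(fake_dataframe):
    # fake_dataframe is a hypothetical fixture returning a small Spark DataFrame;
    # marking the test lets it be selected with `pytest -m add_partition`
    assert fake_dataframe.count() > 0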
4 changes: 4 additions & 0 deletions app/requirements-container.txt
@@ -1,5 +1,9 @@
setuptools
boto3
flake8
pydocstyle
pytest>=7.2.0
pytest-cov
pyparsing
Faker
moto
4 changes: 4 additions & 0 deletions app/requirements-dev.txt
@@ -1,6 +1,10 @@
pyspark
setuptools
boto3
flake8
pydocstyle
pytest>=7.2.0
pytest-cov
pyparsing
Faker
moto
2 changes: 2 additions & 0 deletions app/src/main.py
@@ -16,9 +16,11 @@

# Libraries used in building the module
from datetime import datetime

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, count, avg, sum,\
round, countDistinct, max, expr

from terraglue import GlueETLManager, log_config


43 changes: 17 additions & 26 deletions app/src/terraglue.py
@@ -39,9 +39,11 @@
import sys
import logging
from time import sleep

from pyspark.context import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit, expr

from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
@@ -218,14 +220,9 @@ def get_context_and_session(self) -> None:
for the SparkContext, GlueContext and SparkSession elements.
"""
logger.info("Criando SparkContext, GlueContext e SparkSession")
try:
self.sc = SparkContext.getOrCreate()
self.glueContext = GlueContext(self.sc)
self.spark = self.glueContext.spark_session
except Exception as e:
logger.error("Erro ao criar elementos de contexto e sessão "
f"da aplicação. Exception: {e}")
raise e
self.sc = SparkContext.getOrCreate()
self.glueContext = GlueContext(self.sc)
self.spark = self.glueContext.spark_session

def init_job(self) -> Job:
"""
@@ -250,13 +247,10 @@ def init_job(self) -> Job:
self.get_context_and_session()

# Initializing the Glue Job object
try:
job = Job(self.glueContext)
job.init(self.args['JOB_NAME'], self.args)
return job
except Exception as e:
logger.error(f"Erro ao inicializar job do Glue. Exception: {e}")
raise e
job = Job(self.glueContext)
job.init(self.args['JOB_NAME'], self.args)

return job


# Class for managing Spark transformations in a job
@@ -467,16 +461,12 @@ def generate_dynamic_frames_dict(self) -> dict:

logger.info("Mapeando DynamicFrames às chaves do dicionário")
sleep(0.01)
try:
# Creating the dictionary of DynamicFrames
dynamic_dict = {k: dyf for k, dyf
in zip(self.data_dict.keys(), dynamic_frames)}
logger.info("Dados gerados com sucesso. Total de DynamicFrames: "
f"{len(dynamic_dict.values())}")
except Exception as e:
logger.error("Erro ao mapear DynamicFrames às chaves do "
f"dicionário de dados fornecido. Exception: {e}")
raise e

# Creating the dictionary of DynamicFrames
dynamic_dict = {k: dyf for k, dyf
in zip(self.data_dict.keys(), dynamic_frames)}
logger.info("Dados gerados com sucesso. Total de DynamicFrames: "
f"{len(dynamic_dict.values())}")

# Returning the dictionary of DynamicFrames
sleep(0.01)
@@ -688,6 +678,7 @@ def date_attributes_extraction(df: DataFrame,
"""
try:
# Creating conversion expressions based on the field type
date_col_type = date_col_type.strip().lower()
if convert_string_to_date:
if date_col_type == "date":
conversion_expr = f"to_date({date_col},\
@@ -728,7 +719,7 @@ def date_attributes_extraction(df: DataFrame,
return df

except Exception as e:
logger.error('Error adding coluns to a DataFrame with'
logger.error('Error adding columns to a DataFrame with'
f'new date attributes. Exception: {e}')
raise e

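
With the try/except blocks removed, errors raised inside get_context_and_session, init_job and generate_dynamic_frames_dict now propagate directly to the caller, which keeps both the methods and their unit tests shorter. A minimal sketch of how a test could assert that behaviour, assuming a hypothetical job_manager fixture built without a working Glue context (the real fixtures live in the test suite and are not part of this diff):

import pytest


@pytest.mark.job_manager
def test_init_job_propagates_errors(job_manager):
    # job_manager is a hypothetical fixture; since init_job() no longer wraps
    # Job() and job.init() in try/except, any failure surfaces here and the
    # test only needs to assert that an exception is raised
    with pytest.raises(Exception):
        job_manager.init_job()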
152 changes: 152 additions & 0 deletions app/tests/configs/inputs.py
@@ -0,0 +1,152 @@
"""
SCRIPT: configs/inputs.py
CONTEXT AND OBJECTIVE:
--------------------
Configuration file for the parameters and variables
used in the tests. Users should pay attention to all
configurations and variable declarations made here
so that the unit tests can run properly.
---------------------------------------------------"""

from pyspark.sql.types import StructType, StructField,\
IntegerType, StringType, BooleanType, DecimalType,\
DateType, TimestampType


"""---------------------------------------------------
------ 1. DEFINING CONFIGURATION PARAMETERS -------
1.1 Job arguments and user inputs
---------------------------------------------------"""

# Required job arguments to be validated
JOB_REQUIRED_ARGS = [
"JOB_NAME",
"OUTPUT_BUCKET",
"OUTPUT_DB",
"OUTPUT_TABLE",
"CONNECTION_TYPE",
"UPDATE_BEHAVIOR",
"DATA_FORMAT",
"COMPRESSION",
"ENABLE_UPDATE_CATALOG"
]

# Name of the Terraform variable where the remaining parameters are declared
TF_VAR_NAME_JOB_ARGS = "glue_job_user_arguments"

# List of job arguments defined at runtime
JOB_RUNTIME_ARGS = ["JOB_NAME", "OUTPUT_BUCKET"]

# List of required keys of the DATA_DICT variable in main.py
DATA_DICT_REQUIRED_KEYS = ["database", "table_name", "transformation_ctx"]


"""---------------------------------------------------
------ 2. DEFINING CONFIGURATION PARAMETERS -------
2.2 Parameters for mocking a Spark DataFrame
---------------------------------------------------"""

# Schema for creating a fake Spark DataFrame
FAKE_DATAFRAME_SCHEMA = StructType([
StructField("id", StringType()),
StructField("value", IntegerType()),
StructField("decimal", DecimalType()),
StructField("boolean", BooleanType()),
StructField("date", DateType()),
StructField("timestamp", TimestampType())
])


"""---------------------------------------------------
------ 2. DEFINING CONFIGURATION PARAMETERS -------
2.3 Parameters for mocking the Data Catalog
---------------------------------------------------"""

# Input for mocking a database in the catalog
FAKE_CATALOG_DATABASE_INPUT = {
"Name": "db_fake",
"Description": "a fake database",
"LocationUri": "s3://bucket-fake/db_fake",
"Parameters": {},
"CreateTableDefaultPermissions": [
{
"Principal": {"DataLakePrincipalIdentifier": "a_fake_owner"},
"Permissions": ["ALL"],
},
],
}

# Input for mocking tables in the catalog
FAKE_CATALOG_TABLE_INPUT = {
"Name": "tbl_fake",
"Description": "Entrada para tabela db_fake.tbl_fake contendo "
"metadados mockados para uso em testes unitários",
"Retention": 0,
"StorageDescriptor": {
"Columns": [
{
"Name": "fake_col_1",
"Type": "string",
"Comment": "",
"Parameters": {}
},
{
"Name": "fake_col_2",
"Type": "string",
"Comment": "",
"Parameters": {}
},
{
"Name": "fake_col_3",
"Type": "string",
"Comment": "",
"Parameters": {}
},
{
"Name": "fake_col_4",
"Type": "string",
"Comment": "",
"Parameters": {}
},
{
"Name": "fake_col_5",
"Type": "string",
"Comment": "",
"Parameters": {}
}
],
"Location": "s3://bucket-fake/db_fake/tbl_fake",
"InputFormat": "org.apache.hadoop.hive.ql.io.parquet"
".MapredParquetInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.parquet."
"MapredParquetOutputFormat",
"Compressed": False,
"NumberOfBuckets": 0,
"SerdeInfo": {
"Name": "main-stream",
"SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet."
"serde.ParquetHiveSerDe",
},
"BucketColumns": [],
"SortColumns": [],
"Parameters": {},
"StoredAsSubDirectories": False
},
"PartitionKeys": [],
"TableType": "EXTERNAL_TABLE",
"Parameters": {
"EXTERNAL": "TRUE"
}
}

# Simulated data_dict with mocked resources
FAKE_DATA_DICT = {
"fake": {
"database": "db_fake",
"table_name": "tbl_fake",
"transformation_ctx": "dyf_fake",
"create_temp_view": False
}
}
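
The catalog inputs above are meant to be registered in a mocked Glue Data Catalog before the tests that depend on them run. A minimal sketch of a conftest-style fixture doing that with moto, assuming moto 4.x (which exposes mock_glue), a default region of us-east-1 and the import path tests.configs.inputs; the fixture name, region and import path are assumptions, not part of this commit:

import boto3
import pytest
from moto import mock_glue

from tests.configs.inputs import (
    FAKE_CATALOG_DATABASE_INPUT,
    FAKE_CATALOG_TABLE_INPUT
)


@pytest.fixture()
def fake_glue_catalog():
    # Spins up an in-memory Glue catalog and registers the fake database and table
    with mock_glue():
        client = boto3.client("glue", region_name="us-east-1")
        client.create_database(DatabaseInput=FAKE_CATALOG_DATABASE_INPUT)
        client.create_table(
            DatabaseName=FAKE_CATALOG_DATABASE_INPUT["Name"],
            TableInput=FAKE_CATALOG_TABLE_INPUT
        )
        yield client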
