diff --git a/app/pytest.ini b/app/pytest.ini index b55734e..95bb07a 100644 --- a/app/pytest.ini +++ b/app/pytest.ini @@ -7,4 +7,11 @@ markers = terraform: testes relacionados à declarações do usuário em arquivos Terraform usados para criação de recursos na nuvem date_attributes_extraction: testes relacionados ao método date_attributes_extraction da classe GlueETLManager add_partition: testes relacionados ao método add_partition da classe GlueETLManager - repartition_dataframe: testes relacionados ao método repartition_dataframe da classe GlueETLManager \ No newline at end of file + repartition_dataframe: testes relacionados ao método repartition_dataframe da classe GlueETLManager + main: testes relacionados ao script principal da aplicação Spark implantada + orders: testes relacionados às transformações vinculadas ao DataFrame df_orders + order_items: testes relacionados às transformações vinculadas ao DataFrame df_order_items + customers: testes relacionados às transformações vinculadas ao DataFrame df_customers + payments: testes relacionados às transformações vinculadas ao DataFrame df_payments + reviews: testes relacionados às transformações vinculadas ao DataFrame df_reviews + sot: testes relacionados às transformações vinculadas ao DataFrame df_sot \ No newline at end of file diff --git a/app/requirements.txt b/app/requirements.txt deleted file mode 100644 index 084fd52..0000000 --- a/app/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pyspark -pytest -Faker \ No newline at end of file diff --git a/app/requirements_test_container.txt b/app/requirements_test_container.txt new file mode 100644 index 0000000..ecea547 --- /dev/null +++ b/app/requirements_test_container.txt @@ -0,0 +1,3 @@ +pytest>=7.2.0 +Faker +flake8 \ No newline at end of file diff --git a/app/src/main.py b/app/src/main.py index b21d0f9..1f92627 100644 --- a/app/src/main.py +++ b/app/src/main.py @@ -401,8 +401,6 @@ def transform_sot(self, **kwargs) -> DataFrame: "max_price_order_item", 
"avg_freight_value_order", "max_order_shipping_limit_date", - "customer_unique_id", - "customer_zip_code_prefix", "customer_city", "customer_state", "installments", @@ -456,6 +454,7 @@ def run(self) -> None: # Transformando dados df_orders_prep = self.transform_orders(df=df_orders) df_order_items_prep = self.transform_order_items(df=df_order_items) + df_customers_prep = self.transform_customers(df=df_customers) df_payments_prep = self.transform_payments(df=df_payments) df_reviews_prep = self.transform_reviews(df=df_reviews) @@ -463,7 +462,7 @@ def run(self) -> None: df_sot_prep = self.transform_sot( df_orders_prep=df_orders_prep, df_order_items_prep=df_order_items_prep, - df_customers_prep=df_customers, + df_customers_prep=df_customers_prep, df_payments_prep=df_payments_prep, df_reviews_prep=df_reviews_prep ) diff --git a/app/tests/conftest.py b/app/tests/conftest.py index 3f6a5e4..ed540d7 100644 --- a/app/tests/conftest.py +++ b/app/tests/conftest.py @@ -20,8 +20,9 @@ # Importando módulos para uso import sys +import os from pytest import fixture -from src.main import ARGV_LIST, DATA_DICT +from src.main import ARGV_LIST, DATA_DICT, GlueTransformationManager from src.terraglue import GlueJobManager, GlueETLManager from pyspark.sql import SparkSession from pyspark.sql.types import StructType, StructField, StringType,\ @@ -30,7 +31,6 @@ from tests.utils.spark_helper import generate_fake_spark_dataframe from faker import Faker - # Instanciando objeto Faker faker = Faker() Faker.seed(42) @@ -169,3 +169,163 @@ def fake_dataframe(spark): spark=spark, schema_input=schema ) + + +"""--------------------------------------------------- +----------- 2. 
DEFINIÇÃO DE FIXTURES ÚTEIS ----------- + 2.4 Fixtures utilizadas em test_main +---------------------------------------------------""" + + +# Objeto instanciado da classe GlueETLManager +@fixture() +def glue_manager(job_args_for_testing): + # Adicionando argumentos ao vetor de argumentos + for arg_name, arg_value in job_args_for_testing.items(): + sys.argv.append(f"--{arg_name}={arg_value}") + + glue_manager = GlueTransformationManager( + argv_list=ARGV_LIST, + data_dict=DATA_DICT + ) + + return glue_manager + + +# Amostra de DataFrame df_orders +@fixture() +def df_orders(spark): + # Definindo variável para leitura do DataFrame + filename = "sample_olist_orders_dataset.csv" + data_path = os.path.join( + os.getcwd(), + f"app/tests/samples/{filename}" + ) + + # Realizando a leitura do DataFrame + df = spark.read.format("csv")\ + .option("header", "true")\ + .option("inferSchema", "false")\ + .load(data_path) + + return df + + +# Resultado do método de transformação df_orders_prep +@fixture() +def df_orders_prep(glue_manager, df_orders): + return glue_manager.transform_orders(df_orders) + + +# Amostra de DataFrame df_order_items +@fixture() +def df_order_items(spark): + # Definindo variável para leitura do DataFrame + filename = "sample_olist_order_items_dataset.csv" + data_path = os.path.join( + os.getcwd(), + f"app/tests/samples/{filename}" + ) + + # Realizando a leitura do DataFrame + df = spark.read.format("csv")\ + .option("header", "true")\ + .option("inferSchema", "false")\ + .load(data_path) + + return df + + +# Resultado do método de transformação df_order_items_prep +@fixture() +def df_order_items_prep(glue_manager, df_order_items): + return glue_manager.transform_order_items(df_order_items) + + +# Amostra de DataFrame df_customers +@fixture() +def df_customers(spark): + # Definindo variável para leitura do DataFrame + filename = "sample_olist_customers_dataset.csv" + data_path = os.path.join( + os.getcwd(), + f"app/tests/samples/{filename}" + ) + + # 
Realizando a leitura do DataFrame + df = spark.read.format("csv")\ + .option("header", "true")\ + .option("inferSchema", "false")\ + .load(data_path) + + return df + + +# Resultado do método de transformação df_customers_prep +@fixture() +def df_customers_prep(glue_manager, df_customers): + return glue_manager.transform_customers(df_customers) + + +# Amostra de DataFrame df_payments +@fixture() +def df_payments(spark): + # Definindo variável para leitura do DataFrame + filename = "sample_olist_order_payments_dataset.csv" + data_path = os.path.join( + os.getcwd(), + f"app/tests/samples/{filename}" + ) + + # Realizando a leitura do DataFrame + df = spark.read.format("csv")\ + .option("header", "true")\ + .option("inferSchema", "false")\ + .load(data_path) + + return df + + +# Resultado do método de transformação df_payments_prep +@fixture() +def df_payments_prep(glue_manager, df_payments): + return glue_manager.transform_payments(df_payments) + + +# Amostra de DataFrame df_reviews +@fixture() +def df_reviews(spark): + # Definindo variável para leitura do DataFrame + filename = "sample_olist_order_reviews_dataset.csv" + data_path = os.path.join( + os.getcwd(), + f"app/tests/samples/{filename}" + ) + + # Realizando a leitura do DataFrame + df = spark.read.format("csv")\ + .option("header", "true")\ + .option("inferSchema", "false")\ + .load(data_path) + + return df + + +# Resultado do método de transformação df_reviews_prep +@fixture() +def df_reviews_prep(glue_manager, df_reviews): + return glue_manager.transform_reviews(df_reviews) + + +# Resultado do método de transformação df_sot_prep +@fixture() +def df_sot_prep(glue_manager, df_orders_prep, df_order_items_prep, + df_customers_prep, df_payments_prep, + df_reviews_prep): + return glue_manager.transform_sot( + df_orders_prep=df_orders_prep, + df_order_items_prep=df_order_items_prep, + df_customers_prep=df_customers_prep, + df_payments_prep=df_payments_prep, + df_reviews_prep=df_reviews_prep + ) diff --git 
a/app/tests/samples/sample_olist_customers_dataset.csv b/app/tests/samples/sample_olist_customers_dataset.csv new file mode 100644 index 0000000..44de36e --- /dev/null +++ b/app/tests/samples/sample_olist_customers_dataset.csv @@ -0,0 +1,11 @@ +"customer_id","customer_unique_id","customer_zip_code_prefix","customer_city","customer_state" +"8bb3bef4e75a95524235cdc11a7331af",d1f24d1d504e27bee13b415e40daeab0,"40055",salvador,BA +d987da9fb4086ab7c2c0f83963cd6722,"870a0bdc769f9a7870309036740e79ea","02929",sao paulo,SP +"2430ad4b1b6efb56cf3050b5d3cf5e54",fa78b26c2fa23b2ebda98c2926305c1d,"61979",amanari,CE +"6031cd91d182925af3d38ae9590e5afa","34acad212e30cd0d511be1034e2f9821","14150",serrana,SP +e1f35a414cbae52d09c294b3e58c3e89,a6f9ff98ef3cedac9d8a2b88afc89972,"75144",anapolis,GO +"0825646a316d8b2bdddea079a5e01fda",be2cff6c84f1683300337ecd499992e0,"04001",sao paulo,SP +afc13494642f88d253be56a1e353e261,"2b22fac410c77b08b937809c68e7481e","12955",bom jesus dos perdoes,SP +b626b511cecb256e0d1514d883084a38,"1373e04979cfa0fb2092909abbd57f25","45400",valenca,BA +ca9a6ae226341827c9614ce7568db46c,"15b521471c36ed411359347ff8257b79","03080",sao paulo,SP +"4930dfe106be258618f6907e8ce8795d",e26c256b09efa85577c1c600cf1e9bea,"05269",sao paulo,SP \ No newline at end of file diff --git a/app/tests/samples/sample_olist_order_items_dataset.csv b/app/tests/samples/sample_olist_order_items_dataset.csv new file mode 100644 index 0000000..02deb5f --- /dev/null +++ b/app/tests/samples/sample_olist_order_items_dataset.csv @@ -0,0 +1,14 @@ +"order_id","order_item_id","product_id","seller_id","shipping_limit_date","price","freight_value" +"001ab0a7578dd66cd4b0a71f5b6e1e41",1,"0b0172eb0fd18479d29c3bc122c058c2","5656537e588803a555b8eb41f07a944b",2018-01-04 02:33:42,24.89,17.63 +"001ab0a7578dd66cd4b0a71f5b6e1e41",2,"0b0172eb0fd18479d29c3bc122c058c2","5656537e588803a555b8eb41f07a944b",2018-01-04 02:33:42,24.89,17.63 
+"001ab0a7578dd66cd4b0a71f5b6e1e41",3,"0b0172eb0fd18479d29c3bc122c058c2","5656537e588803a555b8eb41f07a944b",2018-01-04 02:33:42,24.89,17.63 +"001d8f0e34a38c37f7dba2a37d4eba8b",1,e67307ff0f15ade43fcb6e670be7a74c,f4aba7c0bca51484c30ab7bdc34bcdd1,2017-05-18 17:35:11,18.99,7.78 +"001d8f0e34a38c37f7dba2a37d4eba8b",2,e67307ff0f15ade43fcb6e670be7a74c,f4aba7c0bca51484c30ab7bdc34bcdd1,2017-05-18 17:35:11,18.99,7.78 +"0025c5d1a8ca53a240ec2634bb4492ea",1,"35537536ed2b4c561b4018bf3abf54e0","955fee9216a65b617aa5c0531780ce60",2018-07-10 09:30:09,390.00,29.39 +"006f7dfffe2d90809598e8f1972b829b",1,aacfae7cd4bac4849766f640abf2db8a,"729b2d09b2a0bdab221076327f13d050",2018-03-28 23:07:23,39.85,12.79 +"0078a358a14592b887eb140ef515f5ab",1,"722f84416177a451c3be217ef8ffa082",cca3071e3e9bb7d12640c9fbe2301306,2017-11-10 15:55:43,253.52,82.86 +"00921e4911895b93c7b4fc0d80c0815e",1,cbecf0dca7a42c56c9ad9e20c74af1fd,"688756f717c462a206ad854c5027a64a",2018-06-19 14:17:26,85.00,11.62 +"00b2d2f2b5f7b98e6b1828764660134e",1,e0d64dcfaa3b6db5c54ca298ae101d05,"7d13fca15225358621be4086e1eb0964",2018-08-20 08:50:19,146.01,13.68 +"00b30bb163474583c14db1689259cf4d",1,"3552627a68384dc559f0fd4cce173269","3c487ae8f8d7542beff5788e2e0aea83",2018-02-06 14:13:31,189.90,26.61 +"00b676b01c289cc661c6f7732492771a",1,"121b9686b9929855d823981fc655a6fe",c4fb51fb1c5b7c07bc5e67be6e7e8f6e,2017-08-08 15:45:15,50.00,21.19 +"00c2335723b9b74668062e946dc66621",1,b944aabf1fc45c01599ee96c7f4d533e,"128639473a139ac0f3e5f5ade55873a5",2018-01-09 04:28:51,18.90,12.48 \ No newline at end of file diff --git a/app/tests/samples/sample_olist_order_payments_dataset.csv b/app/tests/samples/sample_olist_order_payments_dataset.csv new file mode 100644 index 0000000..d0176da --- /dev/null +++ b/app/tests/samples/sample_olist_order_payments_dataset.csv @@ -0,0 +1,11 @@ +"order_id","payment_sequential","payment_type","payment_installments","payment_value" +"001ab0a7578dd66cd4b0a71f5b6e1e41",1,boleto,1,127.56 
+"001d8f0e34a38c37f7dba2a37d4eba8b",1,credit_card,2,53.54 +"0025c5d1a8ca53a240ec2634bb4492ea",1,credit_card,7,419.39 +"006f7dfffe2d90809598e8f1972b829b",1,credit_card,3,52.64 +"0078a358a14592b887eb140ef515f5ab",1,credit_card,3,336.38 +"00921e4911895b93c7b4fc0d80c0815e",1,credit_card,1,96.62 +"00b2d2f2b5f7b98e6b1828764660134e",1,credit_card,4,159.69 +"00b30bb163474583c14db1689259cf4d",1,credit_card,10,216.51 +"00b676b01c289cc661c6f7732492771a",1,credit_card,1,71.19 +"00c2335723b9b74668062e946dc66621",1,boleto,1,31.38 \ No newline at end of file diff --git a/app/tests/samples/sample_olist_order_reviews_dataset.csv b/app/tests/samples/sample_olist_order_reviews_dataset.csv new file mode 100644 index 0000000..d57824d --- /dev/null +++ b/app/tests/samples/sample_olist_order_reviews_dataset.csv @@ -0,0 +1,11 @@ +"review_id","order_id","review_score","review_comment_title","review_comment_message","review_creation_date","review_answer_timestamp" +"68b49cfcd9420c6ad09af97ea8268e7c","001ab0a7578dd66cd4b0a71f5b6e1e41",4,,Loja rápida na entrega. Só houve um incidente pelo fato de não vim o produto escolhido e sim um similar.,2018-01-18 00:00:00,2018-01-25 03:07:10 +b8fede4fbe6126f9f85ebdd23166","001d8f0e34a38c37f7dba2a37d4eba8b",1,,Entrega prometida 24/05/17. Dia 26/05/17 não havia recebido ainda.,2017-05-26 00:00:00,2017-05-26 20:19:13 +"eba375fa5fe6f3dc7ca2aa6682b46170","0025c5d1a8ca53a240ec2634bb4492ea",5,sensacional,"ótimo produto, bom acabamento. Para quem está começando nesse estudo sobre o universo, principalmente estudantes é uma ótima ferramenta. Tenho feito vários estudos com o auxilio do telescópio.",2018-08-01 00:00:00,2018-08-03 15:21:12 +"6d61a5411b9e9cd49b469a1ca07834c3","006f7dfffe2d90809598e8f1972b829b",2,,Demorou para entregar e quando chegou veio errado. ,2018-04-06 00:00:00,2018-04-17 23:03:40 +"2820a27cb81757fc08f3241188130070","0078a358a14592b887eb140ef515f5ab",5,,"Muito lindo, ficou perfeito na minha cama. 
+"8928004690de207823c49726ee9d53df","00921e4911895b93c7b4fc0d80c0815e",5,,,2018-06-21 00:00:00,2018-06-25 14:09:28 +"7ecb82e62756c3a6b40c9db6a00b9b9d","00b2d2f2b5f7b98e6b1828764660134e",4,RECOMENDO ,"RELOGIO LINDO ,POREM ELE É PEQUENO ,ESTOU ACOSTUMADA COM RELOGIOS TIPO FAUSTAO RS ,MAS ESSE BEM LEVE ,SÓ QUE VEIO CAIXA DE PAPEL BEM INFERIOR POR SER CASIO ",2018-08-16 00:00:00,2018-08-17 11:00:30 +"4be0fcb2fc18e0da288ad4f45c38059e","00b30bb163474583c14db1689259cf4d",5,,Recomendo!!,2018-02-18 00:00:00,2018-02-21 01:43:34 +"16561345af89c9e66c13a051f4f15d65","00b676b01c289cc661c6f7732492771a",3,,O kit comprado tinha 8 módulos e só vieram 7. Mandei um e mail para o fornecedor e até agora não obtive resposta.,2017-08-16 00:00:00,2017-08-16 17:05:07 +"a5f8ba2a5bba6457fc5edfd182710120","00c2335723b9b74668062e946dc66621",5,,Entregue antes do prazo e material de boa qualidade... recomendo,2018-01-10 00:00:00,2018-01-10 18:29:23 \ No newline at end of file diff --git a/app/tests/samples/sample_olist_orders_dataset.csv b/app/tests/samples/sample_olist_orders_dataset.csv new file mode 100644 index 0000000..726d6b0 --- /dev/null +++ b/app/tests/samples/sample_olist_orders_dataset.csv @@ -0,0 +1,11 @@ +"order_id","customer_id","order_status","order_purchase_timestamp","order_approved_at","order_delivered_carrier_date","order_delivered_customer_date","order_estimated_delivery_date" +"001ab0a7578dd66cd4b0a71f5b6e1e41","8bb3bef4e75a95524235cdc11a7331af",delivered,2017-12-27 00:38:47,2017-12-28 02:33:42,2017-12-28 18:09:35,2018-01-17 20:43:29,2018-01-29 00:00:00 +"001d8f0e34a38c37f7dba2a37d4eba8b",d987da9fb4086ab7c2c0f83963cd6722,delivered,2017-05-14 17:19:44,2017-05-14 17:35:11,2017-05-24 15:45:01,2017-05-26 13:14:50,2017-05-24 00:00:00 +"0025c5d1a8ca53a240ec2634bb4492ea","2430ad4b1b6efb56cf3050b5d3cf5e54",delivered,2018-07-08 09:17:59,2018-07-08 09:30:09,2018-07-10 13:36:00,2018-07-31 10:51:58,2018-08-03 00:00:00 
+"006f7dfffe2d90809598e8f1972b829b","6031cd91d182925af3d38ae9590e5afa",delivered,2018-03-22 22:52:46,2018-03-22 23:07:23,2018-03-31 14:18:55,2018-04-05 19:18:35,2018-04-11 00:00:00 +"0078a358a14592b887eb140ef515f5ab",e1f35a414cbae52d09c294b3e58c3e89,delivered,2017-11-06 15:42:42,2017-11-06 22:36:39,2017-11-08 12:24:40,2017-11-21 19:04:54,2017-11-29 00:00:00 +"00921e4911895b93c7b4fc0d80c0815e","0825646a316d8b2bdddea079a5e01fda",delivered,2018-06-13 13:55:55,2018-06-13 14:17:26,2018-06-19 14:02:00,2018-06-20 15:21:57,2018-06-25 00:00:00 +"00b2d2f2b5f7b98e6b1828764660134e",afc13494642f88d253be56a1e353e261,delivered,2018-08-09 08:27:54,2018-08-09 08:50:19,2018-08-10 17:15:00,2018-08-15 20:20:42,2018-08-30 00:00:00 +"00b30bb163474583c14db1689259cf4d",b626b511cecb256e0d1514d883084a38,delivered,2018-01-31 13:57:27,2018-01-31 14:13:31,2018-02-06 17:47:50,2018-02-17 18:36:59,2018-03-05 00:00:00 +"00b676b01c289cc661c6f7732492771a",ca9a6ae226341827c9614ce7568db46c,delivered,2017-08-02 15:30:42,2017-08-02 15:45:15,2017-08-08 15:37:43,2017-08-15 11:22:26,2017-08-24 00:00:00 +"00c2335723b9b74668062e946dc66621","4930dfe106be258618f6907e8ce8795d",delivered,2017-12-31 01:31:16,2018-01-03 04:28:51,2018-01-03 18:05:49,2018-01-09 20:19:21,2018-01-29 00:00:00 \ No newline at end of file diff --git a/app/tests/test_04_main.py b/app/tests/test_04_main.py new file mode 100644 index 0000000..500e8d2 --- /dev/null +++ b/app/tests/test_04_main.py @@ -0,0 +1,435 @@ +""" +SCRIPT: test_main.py + +CONTEXTO: +--------- +Script de testes criado para validar etapas de transformação +existentes no script principal da aplicação Spark +responsável por consolidar toda a aplicação de regras +de negócio utilizadas para o alcance dos objetivos +estabelecidos. + +OBJETIVO: +--------- +Consoldar uma suíte de testes capaz de testar e validar +todas as regras de negócio da aplicação materializadas +como métodos de transformação no script principal. 
+------------------------------------------------------ + +------------------------------------------------------ +---------- 1. PREPARAÇÃO INICIAL DO SCRIPT ----------- + 1.1 Importação das bibliotecas +---------------------------------------------------""" + +# Importando módulos para uso +from pytest import mark +from pyspark.sql.types import StructType, StructField,\ + StringType, IntegerType, DateType, TimestampType,\ + LongType, DecimalType, DoubleType + + +"""--------------------------------------------------- +------------ 2. DEFININDO SUÍTE DE TESTES ------------ + 2.1 Construindo testes unitários +---------------------------------------------------""" + + +@mark.main +@mark.orders +def test_qtd_linhas_resultantes_pos_transformacao_orders( + df_orders, df_orders_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_orders + W: quando o usuário executar o método transform_orders() + da classe GlueTransformationManager utilizando uma + amostra contendo 10 registros + T: então o DataFrame resultante deve manter a granularidade + e conter a mesma quantidade de 10 registros + """ + assert df_orders_prep.count() == df_orders.count() + + +@mark.main +@mark.orders +def test_schema_resultante_pos_transformacao_orders( + df_orders_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_orders + W: quando o usuário executar o método transform_orders() + da classe GlueTransformationManager + T: então o DataFrame resultante deve conter um conjunto + esperado de atributos e tipos primitivos + """ + + # Schema esperado + expected_schema = StructType([ + StructField("order_id", StringType()), + StructField("customer_id", StringType()), + StructField("order_status", StringType()), + StructField("order_purchase_timestamp", TimestampType()), + StructField("order_approved_at", TimestampType()), + StructField("order_delivered_carrier_date", TimestampType()), + StructField("order_delivered_customer_date", 
TimestampType()), + StructField("order_estimated_delivery_date", DateType()), + StructField("year_order_purchase_timestamp", IntegerType()), + StructField("quarter_order_purchase_timestamp", IntegerType()), + StructField("month_order_purchase_timestamp", IntegerType()), + StructField("dayofmonth_order_purchase_timestamp", IntegerType()), + StructField("dayofweek_order_purchase_timestamp", IntegerType()), + StructField("dayofyear_order_purchase_timestamp", IntegerType()), + StructField("weekofyear_order_purchase_timestamp", IntegerType()) + ]) + + assert df_orders_prep.schema == expected_schema + + +@mark.main +@mark.order_items +def test_qtd_linhas_resultantes_pos_transformacao_order_items( + df_order_items_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_order_items + W: quando o usuário executar o método transform_order_items() + da classe GlueTransformationManager utilizando uma + amostra contendo 14 registros + T: então o DataFrame resultante deve retornar uma base + agrupada contendo 10 registros + """ + assert df_order_items_prep.count() == 10 + + +@mark.main +@mark.order_items +def test_nao_duplicidade_de_order_id_pos_transformacao_order_items( + df_order_items_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_order_items + W: quando o usuário executar o método transform_order_items() + da classe GlueTransformationManager + T: então não deve haver nenhum tipo de duplicidade pela chave + order_id no DataFrame resultante + """ + + lines = df_order_items_prep.count() + lines_distinct = df_order_items_prep\ + .dropDuplicates(subset=["order_id"]).count() + + assert lines_distinct == lines + + +@mark.main +@mark.order_items +def test_schema_resultante_pos_transformacao_order_items( + df_order_items_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_order_items + W: quando o usuário executar o método transform_order_items() + da classe 
GlueTransformationManager + T: então o DataFrame resultante deve conter um conjunto + esperado de atributos e tipos primitivos + """ + + # Schema esperado + expected_schema = StructType([ + StructField("order_id", StringType()), + StructField("qty_order_items", LongType(), False), + StructField("sum_price_order", DecimalType(17, 2)), + StructField("avg_price_order", DecimalType(17, 2)), + StructField("min_price_order_item", DecimalType(17, 2)), + StructField("max_price_order_item", DecimalType(17, 2)), + StructField("avg_freight_value_order", DecimalType(17, 2)), + StructField("max_order_shipping_limit_date", TimestampType()), + ]) + + assert df_order_items_prep.schema == expected_schema + + +@mark.main +@mark.customers +def test_qtd_linhas_resultantes_pos_transformacao_customers( + df_customers, df_customers_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_customers + W: quando o usuário executar o método transform_customers() + da classe GlueTransformationManager utilizando uma + amostra contendo 10 registros + T: então o DataFrame resultante deve manter a granularidade + e conter a mesma quantidade de 10 registros + """ + assert df_customers_prep.count() == df_customers.count() + + +@mark.main +@mark.customers +def test_nao_duplicidade_de_customer_id_pos_transformacao_customers( + df_customers_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_customers + W: quando o usuário executar o método transform_customers() + da classe GlueTransformationManager + T: então não deve haver nenhum tipo de duplicidade pela chave + customer_id no DataFrame resultante + """ + + lines = df_customers_prep.count() + lines_distinct = df_customers_prep\ + .dropDuplicates(subset=["customer_id"]).count() + + assert lines_distinct == lines + + +@mark.main +@mark.customers +def test_schema_resultante_pos_transformacao_customers( + df_customers_prep +): + """ + G: dado que o usuário deseja transformar 
dados presentes + no DataFrame df_customers + W: quando o usuário executar o método transform_customers() + da classe GlueTransformationManager + T: então o DataFrame resultante deve conter um conjunto + esperado de atributos e tipos primitivos + """ + + # Schema esperado + expected_schema = StructType([ + StructField("customer_id", StringType()), + StructField("customer_city", StringType()), + StructField("customer_state", StringType()) + ]) + + assert df_customers_prep.schema == expected_schema + + +@mark.main +@mark.payments +def test_qtd_linhas_resultantes_pos_transformacao_payments( + df_payments_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_payments + W: quando o usuário executar o método transform_payments() + da classe GlueTransformationManager utilizando uma + amostra contendo 10 registros + T: então o DataFrame resultante deve retornar uma base + agrupada contendo 10 registros + """ + assert df_payments_prep.count() == 10 + + +@mark.main +@mark.payments +def test_nao_duplicidade_de_order_id_pos_transformacao_payments( + df_payments_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_payments + W: quando o usuário executar o método transform_payments() + da classe GlueTransformationManager + T: então não deve haver nenhum tipo de duplicidade pela chave + order_id no DataFrame resultante + """ + + lines = df_payments_prep.count() + lines_distinct = df_payments_prep\ + .dropDuplicates(subset=["order_id"]).count() + + assert lines_distinct == lines + + +@mark.main +@mark.payments +def test_schema_resultante_pos_transformacao_payments( + df_payments_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_payments + W: quando o usuário executar o método transform_payments() + da classe GlueTransformationManager + T: então o DataFrame resultante deve conter um conjunto + esperado de atributos e tipos primitivos + """ + + # Schema esperado + 
expected_schema = StructType([ + StructField("order_id", StringType()), + StructField("installments", LongType(), False), + StructField("sum_payments", DoubleType()), + StructField("avg_payment_value", DoubleType()), + StructField("distinct_payment_types", LongType(), False), + StructField("most_common_payment_type", StringType()) + ]) + + assert df_payments_prep.schema == expected_schema + + +@mark.main +@mark.reviews +def test_qtd_linhas_resultantes_pos_transformacao_reviews( + df_reviews_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_reviews + W: quando o usuário executar o método transform_reviews() + da classe GlueTransformationManager utilizando uma + amostra contendo 10 registros + T: então o DataFrame resultante deve retornar uma base + agrupada contendo 10 registros + """ + assert df_reviews_prep.count() == 10 + + +@mark.main +@mark.reviews +def test_nao_duplicidade_de_order_id_pos_transformacao_reviews( + df_reviews_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_reviews + W: quando o usuário executar o método transform_reviews() + da classe GlueTransformationManager + T: então não deve haver nenhum tipo de duplicidade pela chave + order_id no DataFrame resultante + """ + + lines = df_reviews_prep.count() + lines_distinct = df_reviews_prep\ + .dropDuplicates(subset=["order_id"]).count() + + assert lines_distinct == lines + + +@mark.main +@mark.reviews +def test_schema_resultante_pos_transformacao_reviews( + df_reviews_prep +): + """ + G: dado que o usuário deseja transformar dados presentes + no DataFrame df_reviews + W: quando o usuário executar o método transform_reviews() + da classe GlueTransformationManager + T: então o DataFrame resultante deve conter um conjunto + esperado de atributos e tipos primitivos + """ + + # Schema esperado + expected_schema = StructType([ + StructField("order_id", StringType()), + StructField("review_best_score", IntegerType()), + 
StructField("review_comment_message", StringType()) + ]) + + assert df_reviews_prep.schema == expected_schema + + +@mark.main +@mark.sot +def test_qtd_linhas_resultantes_pos_transformacao_sot( + df_sot_prep +): + """ + G: dado que o usuário deseja obter a transformação final + após as transformações individuais de suas origens + W: quando o usuário executar o método transform_sot() + da classe GlueTransformationManager utilizando uma + amostra contendo 10 registros + T: então o DataFrame resultante deve retornar uma base + agrupada contendo 10 registros + """ + assert df_sot_prep.count() == 10 + + +@mark.main +@mark.sot +def test_nao_duplicidade_de_order_id_pos_transformacao_sot( + df_sot_prep +): + """ + G: dado que o usuário deseja obter a transformação final + após as transformações individuais de suas origens + W: quando o usuário executar o método transform_sot() + da classe GlueTransformationManager + T: então não deve haver nenhum tipo de duplicidade pela chave + order_id no DataFrame resultante + """ + + lines = df_sot_prep.count() + lines_distinct = df_sot_prep\ + .dropDuplicates(subset=["order_id"]).count() + + assert lines_distinct == lines + + +@mark.main +@mark.sot +def test_schema_resultante_pos_transformacao_sot( + df_sot_prep +): + """ + G: dado que o usuário deseja obter a transformação final + após as transformações individuais de suas origens + W: quando o usuário executar o método transform_sot() + da classe GlueTransformationManager + T: então o DataFrame resultante deve conter um conjunto + esperado de atributos e tipos primitivos + """ + + # Schema esperado + expected_schema = StructType([ + StructField("order_id", StringType()), + StructField("customer_id", StringType()), + StructField("order_status", StringType()), + StructField("order_purchase_timestamp", TimestampType()), + StructField("order_approved_at", TimestampType()), + StructField("order_delivered_carrier_date", TimestampType()), + StructField("order_delivered_customer_date", 
TimestampType()), + StructField("order_estimated_delivery_date", DateType()), + StructField("year_order_purchase_timestamp", IntegerType()), + StructField("quarter_order_purchase_timestamp", IntegerType()), + StructField("month_order_purchase_timestamp", IntegerType()), + StructField("dayofmonth_order_purchase_timestamp", IntegerType()), + StructField("dayofweek_order_purchase_timestamp", IntegerType()), + StructField("dayofyear_order_purchase_timestamp", IntegerType()), + StructField("weekofyear_order_purchase_timestamp", IntegerType()), + StructField("qty_order_items", LongType(), False), + StructField("sum_price_order", DecimalType(17, 2)), + StructField("avg_price_order", DecimalType(17, 2)), + StructField("min_price_order_item", DecimalType(17, 2)), + StructField("max_price_order_item", DecimalType(17, 2)), + StructField("avg_freight_value_order", DecimalType(17, 2)), + StructField("max_order_shipping_limit_date", TimestampType()), + StructField("customer_city", StringType()), + StructField("customer_state", StringType()), + StructField("installments", LongType()), + StructField("sum_payments", DoubleType()), + StructField("avg_payment_value", DoubleType()), + StructField("distinct_payment_types", LongType()), + StructField("most_common_payment_type", StringType()), + StructField("review_best_score", IntegerType()), + StructField("review_comment_message", StringType()) + ]) + + df_sot_prep.printSchema() + + assert df_sot_prep.schema == expected_schema