diff --git a/DESCRIPTION b/DESCRIPTION index 7a0a9c3..cecf18f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: xafty Title: Lightweight package to easily validate a table -Version: 1.3.0 +Version: 1.4.0 Author: David Crone Maintainer: The package maintainer Description: The package is designed to facilitate task automation for diff --git a/NAMESPACE b/NAMESPACE index 564f887..556d44e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(add_regex_columns_to_validity) export(align_column_types) export(as.Date_xafty) export(as.POSIXct_xafty) +export(as.numeric_xafty) export(build_xafty_list) export(build_xafty_test_table) export(check_column_exactinput) diff --git a/R/check_column_types.R b/R/check_column_types.R index 24bf3d8..7dfd7bb 100644 --- a/R/check_column_types.R +++ b/R/check_column_types.R @@ -79,12 +79,21 @@ check_column_types <- function(check_table, validity_table, simply = FALSE) { #' @param validity_table Data table. A validity table that holds the class information for alignment #' @param date_origin Character. A date string for date conversion giving the number of days since e.g. "1900-01-01". This #' is only necessary if the excel date is stored as numeric: (32768, 35981). For more information see: ?as.Date -#' @param tryFormats Character vector. Date formats that should be use to try to convert to date +#' @param tryFormats_Date Character vector. Date formats that should be use to try to convert to date +#' @param tryFormats_POSIXct Character vector. POSIXct formats that should be use to try to convert to POSIXct #' @param tz Timezone for the POSIXct values. Default is UTC +#' @param force_type Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +#' the function keeps the column as is. #' @export -align_column_types <- function(check_table, validity_table, +align_column_types <- function(check_table, validity_table, force_type = TRUE, date_origin = "1899-12-30", - tryFormats = c("%d.%m.%Y", "%d/%m/%Y", "%Y-%m-%d", "%Y/%m/%d"), + tryFormats_Date = c("%d.%m.%Y", "%d/%m/%Y", "%Y-%m-%d", "%Y/%m/%d"), + tryFormats_POSIXct = c("%Y-%m-%d %H:%M:%OS", + "%Y/%m/%d %H:%M:%OS", + "%Y-%m-%d %H:%M", + "%Y/%m/%d %H:%M", + "%Y-%m-%d", + "%Y/%m/%d"), tz = "") { xafty_syntax <- "##!!" possible_classes <- c("text", "date", "number", "factor", "datetime") @@ -102,11 +111,12 @@ align_column_types <- function(check_table, validity_table, switch(xafty_data_type, "##!!text" = check_table[, i] <- as.character(check_table[[i]]), "##!!date" = check_table[, i] <- as.Date_xafty(check_table[[i]], - date_origin = date_origin, tryFormats = tryFormats + force_type = force_type, date_origin = date_origin, tryFormats = tryFormats_Date ), - "##!!number" = check_table[, i] <- as.numeric(check_table[[i]]), + "##!!number" = check_table[, i] <- as.numeric_xafty(check_table[[i]], force_type = force_type), "##!!factor" = check_table[, i] <- as.factor(check_table[[i]]), - "##!!datetime" = check_table[, i] <- as.POSIXct_xafty(check_table[[i]], tz = tz) + "##!!datetime" = check_table[, i] <- as.POSIXct_xafty(check_table[[i]], force_type = force_type, + tryFormats = tryFormats_POSIXct, tz = tz) ) } } diff --git a/R/utils.R b/R/utils.R index 6148ca9..8c5bdf0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -184,28 +184,65 @@ is.Date_xafty <- function(dates, date_origin = "1899-12-30", tryFormats = c("%Y- #' @param dates Character vector of Dates to be Converted #' @param date_origin Character. The date from which numeric dates will be converted into ISO-Date format #' @param tryFormats Character vector. Date formats that should be use to try to convert to date +#' @param force_type Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +#' the function keeps the column as is. #' @return An equally length date vector, \code{NA} when the value could not be converted to date #' @export -as.Date_xafty <- function(dates, date_origin = "1899-12-30", tryFormats = c("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%Y/%m/%d")) { - xafty_column <- sapply(dates, \(date) { - tryCatch( - { - as.Date(date, tryFormats = tryFormats) - }, - error = function(e) { - numeric_date <- suppressWarnings(as.numeric(date)) - if (is.na(numeric_date)) { - return(NA) +as.Date_xafty <- function(dates, date_origin = "1899-12-30", + force_type = TRUE, tryFormats = c("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%Y/%m/%d")) { + + if (force_type) { + xafty_column <- sapply(dates, \(date) { + tryCatch( + { + as.Date(date, tryFormats = tryFormats) + }, + error = function(e) { + numeric_date <- suppressWarnings(as.numeric(date)) + if (is.na(numeric_date)) { + return(NA) + } + + as.Date(numeric_date, origin = date_origin) } + ) + }) - as.Date(numeric_date, origin = date_origin) - } - ) - }) + names(xafty_column) <- NULL + + as.Date(xafty_column, origin = "1970-01-01") + + } else { + + na_position <- which(is.na(dates)) + xafty_column <- sapply(dates, \(date) { + tryCatch( + { + as.Date(date, tryFormats = tryFormats) + }, + error = function(e) { + numeric_date <- suppressWarnings(as.numeric(date)) + if (is.na(numeric_date)) { + return(NA) + } + + as.Date(numeric_date, origin = date_origin) + } + ) + }) + + na_position_after <- which(is.na(xafty_column)) + + if(sum(na_position_after) > sum(na_position)) { + return(dates) + } else { + names(xafty_column) <- NULL + as.Date(xafty_column, origin = "1970-01-01") + } + + } - names(xafty_column) <- NULL - as.Date(xafty_column, origin = "1970-01-01") } #' @title Check if Passed Values can be Parsed as Numeric @@ -225,6 +262,26 @@ is.numeric_xafty <- function(numbers) { xafty_column } +#' @title Check if Passed Values can be Parsed as Numeric +#' @param numbers Character vector of Numbers to be converted +#' @param force_type Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +#' the function keeps the column as is. +#' @return An equally length numeric vector if the numerics could be successfully converted to NA. +#' @export +as.numeric_xafty <- function(numbers, force_type = TRUE) { + + position_na <- which(is.na(numbers)) + + position_na_conversion <- which(is.na(suppressWarnings(as.numeric(numbers)))) + + if(sum(position_na_conversion) > sum(position_na) & !force_type) { + return(numbers) + } else { + as.numeric(numbers) + } + +} + #' @title Check if Passed Values can be Parsed as POSIXct #' @param datetimes Character vector of date time values to be parsed #' @param tz Timezone for the POSIXct values. Default is UTC @@ -252,22 +309,59 @@ is.POSIXct_xafty <- function(datetimes, tz = "") { #' @title Convert Passed Values to POSIXct #' @param datetimes Character vector of date time values to be parsed #' @param tz Timezone for the POSIXct values. Default is UTC +#' @param force_type Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +#' the function keeps the column as is. +#' @param tryFormats Character vector. POSIXct formats that should be use to try to convert to POSIXct +#' the function keeps the column as is #' @export -as.POSIXct_xafty <- function(datetimes, tz = "") { - xafty_column <- sapply(datetimes, \(datetime) { - tryCatch( - { - as.POSIXct(datetime, tz = tz) - }, - error = function(e) { - NA - } - ) - }) +as.POSIXct_xafty <- function(datetimes, force_type = TRUE, + tryFormats = c("%Y-%m-%d %H:%M:%OS", + "%Y/%m/%d %H:%M:%OS", + "%Y-%m-%d %H:%M", + "%Y/%m/%d %H:%M", + "%Y-%m-%d", + "%Y/%m/%d"), + tz = "") { + + if (force_type) { + xafty_column <- sapply(datetimes, \(datetime) { + tryCatch( + { + as.POSIXct(datetime, tz = tz) + }, + error = function(e) { + NA + } + ) + }) - names(xafty_column) <- NULL + names(xafty_column) <- NULL + as.POSIXct(xafty_column, tryFormats = tryFormats) + + } else { + na_position <- which(is.na(datetimes)) + + xafty_column <- sapply(datetimes, \(datetime) { + tryCatch( + { + as.POSIXct(datetime, tz = tz) + }, + error = function(e) { + NA + } + ) + }) + + na_position_after <- which(is.na(xafty_column)) + + if(sum(na_position_after) > sum(na_position)) { + return(datetimes) + } else { + names(xafty_column) <- NULL + as.POSIXct(xafty_column, tryFormats = tryFormats) + } + } - as.POSIXct(xafty_column) } #' @title Build a List of Test Results @@ -277,6 +371,7 @@ as.POSIXct_xafty <- function(datetimes, tz = "") { #' #' @param check_table Data Frame. The table that will be checked against the specified rules in the validity table. #' @param validity_table Data Frame. A validation table that stores the rules that the check table will be checked against. +#' @param align_columns Boolean. Whether to align column types before checking for xafty rules. Does not coerce values to NA! #' @param meta_tests_name Character. Name of the list item that stores all meta_tests. The parameter is there to help avoid naming #' conflicts with column names from the check table or validity table. #' @param check_names Boolean. Adds a meta test that checks whether all column names of the validity table are present in @@ -285,12 +380,17 @@ as.POSIXct_xafty <- function(datetimes, tz = "") { #' larger than the columns in the validity table. #' @returns A list. #' @export -build_xafty_list <- function(check_table, validity_table, +build_xafty_list <- function(check_table, validity_table, align_columns = TRUE, meta_tests_name = "meta_tests", check_names = TRUE, check_number = TRUE) { validity_table <- add_regex_columns_to_validity(check_table = check_table, validity_table = validity_table, multiple = "remove") + if(align_columns) { + check_table <- align_column_types(check_table = check_table, validity_table = validity_table, + force_type = FALSE) + } + base_column_list <- list() if (check_names || check_number) { diff --git a/R/xafty_rules.R b/R/xafty_rules.R index 91f51dc..f65814c 100644 --- a/R/xafty_rules.R +++ b/R/xafty_rules.R @@ -9,6 +9,7 @@ #' \item{description}{A character vector of rule description in prosa.} #' \item{check_function}{A list of functions for check purposes.} #' \item{filter_function}{A list of functions to filter values.} +#' \item{change_type_function}{A list of functions change data to appropriate types.} #' } #' #' @source The data is curated by xafty developers and grows as new rules are added. diff --git a/man/align_column_types.Rd b/man/align_column_types.Rd index d6a03f8..a1b854f 100644 --- a/man/align_column_types.Rd +++ b/man/align_column_types.Rd @@ -7,8 +7,11 @@ align_column_types( check_table, validity_table, + force_type = TRUE, date_origin = "1899-12-30", - tryFormats = c("\%d.\%m.\%Y", "\%d/\%m/\%Y", "\%Y-\%m-\%d", "\%Y/\%m/\%d"), + tryFormats_Date = c("\%d.\%m.\%Y", "\%d/\%m/\%Y", "\%Y-\%m-\%d", "\%Y/\%m/\%d"), + tryFormats_POSIXct = c("\%Y-\%m-\%d \%H:\%M:\%OS", "\%Y/\%m/\%d \%H:\%M:\%OS", + "\%Y-\%m-\%d \%H:\%M", "\%Y/\%m/\%d \%H:\%M", "\%Y-\%m-\%d", "\%Y/\%m/\%d"), tz = "" ) } @@ -17,10 +20,15 @@ align_column_types( \item{validity_table}{Data table. A validity table that holds the class information for alignment} +\item{force_type}{Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +the function keeps the column as is.} + \item{date_origin}{Character. A date string for date conversion giving the number of days since e.g. "1900-01-01". This is only necessary if the excel date is stored as numeric: (32768, 35981). For more information see: ?as.Date} -\item{tryFormats}{Character vector. Date formats that should be use to try to convert to date} +\item{tryFormats_Date}{Character vector. Date formats that should be use to try to convert to date} + +\item{tryFormats_POSIXct}{Character vector. POSIXct formats that should be use to try to convert to POSIXct} \item{tz}{Timezone for the POSIXct values. Default is UTC} } diff --git a/man/as.Date_xafty.Rd b/man/as.Date_xafty.Rd index 6fa9785..210b47c 100644 --- a/man/as.Date_xafty.Rd +++ b/man/as.Date_xafty.Rd @@ -7,6 +7,7 @@ as.Date_xafty( dates, date_origin = "1899-12-30", + force_type = TRUE, tryFormats = c("\%Y-\%m-\%d", "\%d.\%m.\%Y", "\%d/\%m/\%Y", "\%Y/\%m/\%d") ) } @@ -15,6 +16,9 @@ as.Date_xafty( \item{date_origin}{Character. The date from which numeric dates will be converted into ISO-Date format} +\item{force_type}{Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +the function keeps the column as is.} + \item{tryFormats}{Character vector. Date formats that should be use to try to convert to date} } \value{ diff --git a/man/as.POSIXct_xafty.Rd b/man/as.POSIXct_xafty.Rd index 023a841..bfe626a 100644 --- a/man/as.POSIXct_xafty.Rd +++ b/man/as.POSIXct_xafty.Rd @@ -4,11 +4,23 @@ \alias{as.POSIXct_xafty} \title{Convert Passed Values to POSIXct} \usage{ -as.POSIXct_xafty(datetimes, tz = "") +as.POSIXct_xafty( + datetimes, + force_type = TRUE, + tryFormats = c("\%Y-\%m-\%d \%H:\%M:\%OS", "\%Y/\%m/\%d \%H:\%M:\%OS", + "\%Y-\%m-\%d \%H:\%M", "\%Y/\%m/\%d \%H:\%M", "\%Y-\%m-\%d", "\%Y/\%m/\%d"), + tz = "" +) } \arguments{ \item{datetimes}{Character vector of date time values to be parsed} +\item{force_type}{Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +the function keeps the column as is.} + +\item{tryFormats}{Character vector. POSIXct formats that should be use to try to convert to POSIXct +the function keeps the column as is} + \item{tz}{Timezone for the POSIXct values. Default is UTC} } \description{ diff --git a/man/as.numeric_xafty.Rd b/man/as.numeric_xafty.Rd new file mode 100644 index 0000000..0737fe7 --- /dev/null +++ b/man/as.numeric_xafty.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{as.numeric_xafty} +\alias{as.numeric_xafty} +\title{Check if Passed Values can be Parsed as Numeric} +\usage{ +as.numeric_xafty(numbers, force_type = TRUE) +} +\arguments{ +\item{numbers}{Character vector of Numbers to be converted} + +\item{force_type}{Boolean. Whether to force the type conversion even if it introduces NAs during type conversion. If TRUE +the function keeps the column as is.} +} +\value{ +An equally length numeric vector if the numerics could be successfully converted to NA. +} +\description{ +Check if Passed Values can be Parsed as Numeric +} diff --git a/man/build_xafty_list.Rd b/man/build_xafty_list.Rd index e6fe844..f9fb5a0 100644 --- a/man/build_xafty_list.Rd +++ b/man/build_xafty_list.Rd @@ -7,6 +7,7 @@ build_xafty_list( check_table, validity_table, + align_columns = TRUE, meta_tests_name = "meta_tests", check_names = TRUE, check_number = TRUE @@ -17,6 +18,8 @@ build_xafty_list( \item{validity_table}{Data Frame. A validation table that stores the rules that the check table will be checked against.} +\item{align_columns}{Boolean. Whether to align column types before checking for xafty rules. Does not coerce values to NA!} + \item{meta_tests_name}{Character. Name of the list item that stores all meta_tests. The parameter is there to help avoid naming conflicts with column names from the check table or validity table.} diff --git a/man/xafty_rules_table.Rd b/man/xafty_rules_table.Rd index 405781c..cc6ae06 100644 --- a/man/xafty_rules_table.Rd +++ b/man/xafty_rules_table.Rd @@ -12,6 +12,7 @@ A data frame with 6 columns: \item{description}{A character vector of rule description in prosa.} \item{check_function}{A list of functions for check purposes.} \item{filter_function}{A list of functions to filter values.} + \item{change_type_function}{A list of functions change data to appropriate types.} } } \source{ diff --git a/tests/testthat/test-align_column_types.R b/tests/testthat/test-align_column_types.R new file mode 100644 index 0000000..081e377 --- /dev/null +++ b/tests/testthat/test-align_column_types.R @@ -0,0 +1,94 @@ +test_that("Align column types changes columns to the specified type in the validity table", { + + check_table <- data.frame( + "Product_Name" = c("Apple", "Banana", "Apple"), + "Product_Weight" = c("2.1", "0.5", "1.0"), + "Expiration_Date" = c("2022-02-01", "2021-04-25", "2025-03-02"), + "Delivery_Time" = c("2023-09-12 22:12:01", "2023-09-17 22:12:01", "2023-09-12 01:12:01"), + "Is_Delivered_By" = c("zzz__Train", "Train no Rails", "Truck"), + "Mail_Customer" = c("applelover@yahoo.com", "banana_digester@bananas.uk", "grrm@asoiaf.com"), + "Date_Arrival" = c("102203", "102202", "102201"), + "W123" = c("1", "1", "1"), + "W421" = c("1", "0", "0"), + "T333" = c("1.1", "2", "0") + ) + + validity_table <- data.frame( + "Product_Name" = c("##!!text", "##!!eachexact", "Apple", "Banana", "##!!notempty"), + "Product_Weight" = c("##!!number", NA, NA, NA, NA), + "Expiration_Date" = c("##!!date", "##!!eachexact", "2022-02-01", NA, NA), + "Delivery_Time" = c("##!!datetime"), + "Is_Delivered_By" = c("##!!text", "##!!rowpattern", "Train", "Truck", "##!!notempty"), + "Mail_Customer" = c("##!!text", "##!!strictpattern", "@", NA, NA), + "Date_Arrival" = c("##!!date", NA, NA, NA, NA), + "Wagon_Design" = c("##!!number", "##!!regexcolumns", "^W[1-9]", "333", "##!!notempty") + ) + + validity_table <- add_regex_columns_to_validity(check_table, validity_table) + + check_table_aligned <- align_column_types(check_table, validity_table, force_type = TRUE) + + column_types <- do.call(c ,sapply(check_table_aligned, class, simplify = TRUE)) + names(column_types) <- NULL + + expect_equal(column_types, c("character", "numeric", "Date", "POSIXct", "POSIXt", "character", "character", "Date", + "numeric", "numeric", "numeric")) + +}) + +test_that("align_column_types does not coerce values to NA if force_type is FALSE", { + + check_table <- data.frame( + "Product_Name" = c("Apple", "Banana", "Apple"), + "Product_Weight" = c("2.1", "a0.5", "1.0"), + "Expiration_Date" = c("2022.02-01", "2021-04-25", "2025-03-02"), + "Delivery_Time" = c("2023-09-12 22:12:01", "2023-09.17 22:12:01", "2023-09-12 01:12:01") + ) + + validity_table <- data.frame( + "Product_Name" = c("##!!factor", "##!!eachexact", "Apple", "Banana", "##!!notempty"), + "Product_Weight" = c("##!!number", NA, NA, NA, NA), + "Expiration_Date" = c("##!!date", "##!!eachexact", "2022-02-01", NA, NA), + "Delivery_Time" = c("##!!datetime") + ) + + check_table_aligned <- align_column_types(check_table, validity_table, force_type = FALSE) + + column_types <- sapply(check_table_aligned, class) + names(column_types) <- NULL + + expect_equal(column_types, c("factor", "character", "character", "character")) + expect_equal(check_table$Product_Weight, check_table_aligned$Product_Weight) + expect_equal(check_table$Expiration_Date, check_table_aligned$Expiration_Date) + expect_equal(check_table$Delivery_Time, check_table_aligned$Delivery_Time) + +}) + +test_that("align_column_types does coerce values to NA if force_type is TRUE", { + + check_table <- data.frame( + "Product_Name" = c("Apple", "Banana", "Apple"), + "Product_Weight" = c("2.1", "a0.5", "1.0"), + "Expiration_Date" = c("2022.02-01", "2021-04-25", "2025-03-02"), + "Delivery_Time" = c("2023-09-12 22:12:01", "2023-09.17 22:12:01", "2023-09-12 01:12:01") + ) + + validity_table <- data.frame( + "Product_Name" = c("##!!factor", "##!!eachexact", "Apple", "Banana", "##!!notempty"), + "Product_Weight" = c("##!!number", NA, NA, NA, NA), + "Expiration_Date" = c("##!!date", "##!!eachexact", "2022-02-01", NA, NA), + "Delivery_Time" = c("##!!datetime") + ) + + check_table_aligned <- suppressWarnings(align_column_types(check_table, validity_table, force_type = TRUE)) + + column_types <- do.call(c, sapply(check_table_aligned, class)) + names(column_types) <- NULL + + expect_equal(column_types, c("factor", "numeric", "Date", "POSIXct", "POSIXt")) + expect_equal(as.character(check_table_aligned$Product_Name), check_table$Product_Name) + expect_equal(as.character(check_table_aligned$Product_Weight), c("2.1", NA, "1")) + expect_equal(as.character(check_table_aligned$Expiration_Date), c(NA, "2021-04-25", "2025-03-02")) + expect_equal(as.character(check_table_aligned$Delivery_Time), c("2023-09-12 22:12:01", NA, "2023-09-12 01:12:01")) + +})