Merge pull request #210 from njtierney/nabular

implement the nabular class, fixes #192
njtierney · Sep 3, 2018 · 885de6f · 885de6f
2 parents 682c27f + 2b9339f
commit 885de6f
Show file tree

Hide file tree

Showing 25 changed files with 232 additions and 25 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: naniar
 Type: Package
 Title: Data Structures, Summaries, and Visualisations for Missing Data
-Version: 0.3.10.9000
+Version: 0.3.10.9100
 Authors@R: c(
   person("Nicholas", "Tierney", 
          role = c("aut", "cre"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -41,6 +41,7 @@ export(all_na)
 export(any_complete)
 export(any_miss)
 export(any_na)
+export(any_shade)
 export(are_na)
 export(are_shade)
 export(as_shadow)
@@ -73,6 +74,7 @@ export(impute_mean_all)
 export(impute_mean_at)
 export(impute_mean_if)
 export(is_na)
+export(is_nabular)
 export(is_shade)
 export(is_shadow)
 export(label_miss_1d)
@@ -100,6 +102,7 @@ export(n_miss)
 export(n_miss_row)
 export(n_var_complete)
 export(n_var_miss)
+export(nabular)
 export(new_nabular)
 export(new_shadow)
 export(pct_complete)

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,13 @@
-# naniar 0.3.10.9000 (2018/08/20)
+# naniar 0.3.10.9100 (2018/09/03)
+
+## New Feature
+
+* `any_shade()` returns a logical TRUE or FALSE depending on if there are any `shade` values
+* `nabular()` is an alias for `bind_shadow()`, added to tie the `nabular` term into the work.
+* `is_nabular()` checks if input is nabular.
+
+
+# naniar 0.3.10.9000 (2018/08/31)
 
 ## New Feature
 

diff --git a/R/nabular.R b/R/nabular.R
@@ -6,7 +6,31 @@
 #' @export
 new_nabular <- function(x){
 
-  # include tests for checking that the data contains a shadow
-
+  # nabular data needs shade columns alongside regular columns: reject input
+  # where every column is a shade, or where no column is a shade at all
+  if (sum(are_shade(x)) == ncol(x) || !any_shade(x)) {
+    rlang::abort(message = "data must have shadow data with the regular data")
+  }
   tibble::new_tibble(x, subclass = "nabular")
 }
+
+#' Convert data into nabular form by binding shade to it
+#'
+#' Binding a shadow matrix to a regular dataframe converts it into nabular data,
+#'   which makes it easier to visualise and work with missing data.
+#'
+#' @param data a dataframe
+#' @param only_miss logical - if FALSE (default) it will bind a dataframe with
+#'     all of the variables duplicated with their shadow. Setting this to TRUE
+#'     will bind only those variables that contain missing values.
+#'     See the examples for more details.
+#' @param ... extra options to pass to [recode_shadow()] - a work in progress.
+#'
+#' @return data with the added variable shifted and the suffix `_NA`
+#' @export
+#' @seealso [bind_shadow()]
+#'
+#' @examples
+#' nabular(airquality)
+#' nabular(airquality, only_miss = TRUE)
+nabular <- function(data, only_miss = FALSE, ...){
+  bind_shadow(data = data, only_miss = only_miss, ...)
+}
diff --git a/R/shade.R b/R/shade.R
@@ -33,6 +33,16 @@ new_shade <- function(x, extra_levels = NULL){
 #' xs <- shade(c(NA, 1, 2, "3"))
 #'
 #' is_shade(xs)
+#' are_shade(xs)
+#' any_shade(xs)
+#'
+#' aq_s <- as_shadow(airquality)
+#'
+#' is_shade(aq_s)
+#' are_shade(aq_s)
+#' any_shade(aq_s)
+#' any_shade(airquality)
+#'
 #'
 is_shade <- function(x){
   inherits(x, "shade")
@@ -86,7 +96,7 @@ shade <- function(x, ..., extra_levels = NULL){
   test_if_null(x)
 
   if (length(x) == 0) {
-    rlang::abort(msg = "input to shade must have length > 0")
+    rlang::abort(message = "input to shade must have length > 0")
   }
 
   # if no other levels are specified
@@ -132,3 +142,9 @@ shade <- function(x, ..., extra_levels = NULL){
   # and return a new shade value
   new_shade(x, extra_levels)
 }
+
+#' @export
+#' @rdname is_shade
+any_shade <- function(x){
+  # TRUE when at least one of the flags returned by are_shade(x) is TRUE
+  shade_flags <- are_shade(x)
+  any(shade_flags)
+}
diff --git a/R/shadow-verifiers.R b/R/shadow-verifiers.R
@@ -16,10 +16,17 @@
 #' is_shadow(aq_sh)
 #' is_shadow(airquality)
 #' is_shadow(aq_bind)
+#' is_nabular(aq_bind)
 #'
 #' @export
 #' @name is_shadow
-
 is_shadow <- function(x){
   inherits(x, "shadow")
 }
+
+#' @export
+#' @rdname is_shadow
+is_nabular <- function(x){
+  # class-based check: does `x` carry the "nabular" S3 class?
+  inherits(x, what = "nabular")
+}
+
diff --git a/R/shadows.R b/R/shadows.R
@@ -131,7 +131,8 @@ bind_shadow <- function(data, only_miss = FALSE, ...){
 
     # class(shadow_data) <- c("shadow", class(shadow_data))
 
-    return(new_shadow(shadow_data))
+    # return(new_shadow(shadow_data))
+    return(new_nabular(shadow_data))
 
   # if you want All the values to be added (the default behaviour)
   }
@@ -150,7 +151,8 @@ bind_shadow <- function(data, only_miss = FALSE, ...){
 
     # class(shadow_data) <- c("shadow", class(shadow_data))
 
-    return(new_shadow(shadow_data))
+    # return(new_shadow(shadow_data))
+    return(new_nabular(shadow_data))
 
   }
 

diff --git a/README.Rmd b/README.Rmd
@@ -20,14 +20,14 @@ knitr::opts_chunk$set(
 `naniar` provides principled, tidy ways to summarise, visualise, and manipulate missing data with minimal deviations from the workflows in ggplot2 and tidy data. It does this by providing:
 
 - Shadow matrices, a tidy data structure for missing data:
-    - `as_shadow()` and `bind_shadow()`
+    - `bind_shadow()` and `nabular()`
 - Shorthand summaries for missing data: 
     - `n_miss()` and `n_complete()`
     - `pct_miss()`and `pct_complete()`
 - Numerical summaries of missing data in variables and cases:
     - `miss_var_summary()` and `miss_var_table()`
     - `miss_case_summary()`, `miss_case_table()`
-- Visualisation methods:
+- Visualisation for missing data:
     - `geom_miss_point()`
     - `gg_miss_var()`
     - `gg_miss_case()`
@@ -54,7 +54,7 @@ remotes::install_github("njtierney/naniar")
 
 # A short overview of naniar
 
-Visualising missing data might sound a little strange - how do you visualise something that is not there?  One approach to visualising missing data comes from [ggobi](http://www.ggobi.org/) and [manet](https://www.swmath.org/software/3067), where we replace "NA" values with values 10% lower than the minimum value in that variable. This visualisation is provided with the `geom_miss_point()` ggplot2 geom - which we illustrate by exploring the relationship between Ozone and Solar radiation from the airquality dataset.
+Visualising missing data might sound a little strange - how do you visualise something that is not there? One approach to visualising missing data comes from [ggobi](http://www.ggobi.org/) and [manet](https://www.swmath.org/software/3067), which replaces `NA` values with values 10% lower than the minimum value in that variable. This visualisation is provided with the `geom_miss_point()` ggplot2 geom - which we illustrate by exploring the relationship between Ozone and Solar radiation from the airquality dataset.
 
 ```{r regular-geom-point}
 
@@ -69,7 +69,7 @@ ggplot(data = airquality,
 
 ggplot2 does not handle these missing values, and we get a warning message about the missing values.
 
-We can instead use the `geom_miss_point()` to display the missing data
+We can instead use `geom_miss_point()` to display the missing data
 
 ```{r geom-miss-point}
 
@@ -110,8 +110,14 @@ as_shadow(airquality)
 
 ```
 
+Binding the shadow data to the data helps you keep better track of the missing values. This format is called "nabular", a portmanteau of `NA` and `tabular`. You can bind the shadow to the data using `bind_shadow` or `nabular`:
 
-Using the shadow matrix helps you manage where missing values are in your dataset and make it easy to do visualisations where you split by missingness:
+```{r show-nabular}
+bind_shadow(airquality)
+nabular(airquality)
+```
+
+Using the nabular format helps you manage where missing values are in your dataset and makes it easy to do visualisations where you split by missingness:
 
 ```{r shadow-w-ggplot}
 
@@ -125,7 +131,6 @@ airquality %>%
 
 And even visualise imputations 
 
-
 ```{r shadow-impute}
 
 airquality %>%
@@ -165,7 +170,6 @@ gg_miss_span(pedestrian,
              span_every = 1500)
 ```
 
-
 You can read about all of the visualisations in naniar in the vignette [Gallery of missing data visualisations using naniar](http://naniar.njtierney.com/articles/naniar-visualisation.html).
 
 naniar also provides handy helpers for calculating the number, proportion, and percentage of missing and complete observations:

diff --git a/README.md b/README.md
@@ -20,14 +20,14 @@ manipulate missing data with minimal deviations from the workflows in
 ggplot2 and tidy data. It does this by providing:
 
   - Shadow matrices, a tidy data structure for missing data:
-      - `as_shadow()` and `bind_shadow()`
+      - `bind_shadow()` and `nabular()`
   - Shorthand summaries for missing data:
       - `n_miss()` and `n_complete()`
       - `pct_miss()`and `pct_complete()`
   - Numerical summaries of missing data in variables and cases:
       - `miss_var_summary()` and `miss_var_table()`
       - `miss_case_summary()`, `miss_case_table()`
-  - Visualisation methods:
+  - Visualisation for missing data:
       - `geom_miss_point()`
       - `gg_miss_var()`
       - `gg_miss_case()`
@@ -61,7 +61,7 @@ remotes::install_github("njtierney/naniar")
 Visualising missing data might sound a little strange - how do you
 visualise something that is not there? One approach to visualising
 missing data comes from [ggobi](http://www.ggobi.org/) and
-[manet](https://www.swmath.org/software/3067), where we replace “NA”
+[manet](https://www.swmath.org/software/3067), which replaces `NA`
 values with values 10% lower than the minimum value in that variable.
 This visualisation is provided with the `geom_miss_point()` ggplot2 geom
 - which we illustrate by exploring the relationship between Ozone and
@@ -83,7 +83,7 @@ ggplot(data = airquality,
 ggplot2 does not handle these missing values, and we get a warning
 message about the missing values.
 
-We can instead use the `geom_miss_point()` to display the missing data
+We can instead use `geom_miss_point()` to display the missing data
 
 ``` r
 
@@ -156,7 +156,47 @@ as_shadow(airquality)
 #> # ... with 143 more rows
 ```
 
-Using the shadow matrix helps you manage where missing values are in
+Binding the shadow data to the data helps you keep better track of the
+missing values. This format is called “nabular”, a portmanteau of `NA`
+and `tabular`. You can bind the shadow to the data using `bind_shadow`
+or `nabular`:
+
+``` r
+bind_shadow(airquality)
+#> # A tibble: 153 x 12
+#>    Ozone Solar.R  Wind  Temp Month   Day Ozone_NA Solar.R_NA Wind_NA
+#>    <int>   <int> <dbl> <int> <int> <int> <fct>    <fct>      <fct>  
+#>  1    41     190   7.4    67     5     1 !NA      !NA        !NA    
+#>  2    36     118   8      72     5     2 !NA      !NA        !NA    
+#>  3    12     149  12.6    74     5     3 !NA      !NA        !NA    
+#>  4    18     313  11.5    62     5     4 !NA      !NA        !NA    
+#>  5    NA      NA  14.3    56     5     5 NA       NA         !NA    
+#>  6    28      NA  14.9    66     5     6 !NA      NA         !NA    
+#>  7    23     299   8.6    65     5     7 !NA      !NA        !NA    
+#>  8    19      99  13.8    59     5     8 !NA      !NA        !NA    
+#>  9     8      19  20.1    61     5     9 !NA      !NA        !NA    
+#> 10    NA     194   8.6    69     5    10 NA       !NA        !NA    
+#> # ... with 143 more rows, and 3 more variables: Temp_NA <fct>,
+#> #   Month_NA <fct>, Day_NA <fct>
+nabular(airquality)
+#> # A tibble: 153 x 12
+#>    Ozone Solar.R  Wind  Temp Month   Day Ozone_NA Solar.R_NA Wind_NA
+#>    <int>   <int> <dbl> <int> <int> <int> <fct>    <fct>      <fct>  
+#>  1    41     190   7.4    67     5     1 !NA      !NA        !NA    
+#>  2    36     118   8      72     5     2 !NA      !NA        !NA    
+#>  3    12     149  12.6    74     5     3 !NA      !NA        !NA    
+#>  4    18     313  11.5    62     5     4 !NA      !NA        !NA    
+#>  5    NA      NA  14.3    56     5     5 NA       NA         !NA    
+#>  6    28      NA  14.9    66     5     6 !NA      NA         !NA    
+#>  7    23     299   8.6    65     5     7 !NA      !NA        !NA    
+#>  8    19      99  13.8    59     5     8 !NA      !NA        !NA    
+#>  9     8      19  20.1    61     5     9 !NA      !NA        !NA    
+#> 10    NA     194   8.6    69     5    10 NA       !NA        !NA    
+#> # ... with 143 more rows, and 3 more variables: Temp_NA <fct>,
+#> #   Month_NA <fct>, Day_NA <fct>
+```
+
+Using the nabular format helps you manage where missing values are in
 your dataset and make it easy to do visualisations where you split by
 missingness:
 

diff --git a/man/figures/README-facet-by-month-1.png b/man/figures/README-facet-by-month-1.png
diff --git a/man/figures/README-geom-miss-point-1.png b/man/figures/README-geom-miss-point-1.png
diff --git a/man/figures/README-gg-miss-span-1.png b/man/figures/README-gg-miss-span-1.png
diff --git a/man/figures/README-gg-miss-upset-1.png b/man/figures/README-gg-miss-upset-1.png
diff --git a/man/figures/README-gg-miss-var-1.png b/man/figures/README-gg-miss-var-1.png
diff --git a/man/figures/README-regular-geom-point-1.png b/man/figures/README-regular-geom-point-1.png
diff --git a/man/figures/README-shadow-impute-1.png b/man/figures/README-shadow-impute-1.png
diff --git a/man/figures/README-shadow-w-ggplot-1.png b/man/figures/README-shadow-w-ggplot-1.png
diff --git a/man/is_shade.Rd b/man/is_shade.Rd
diff --git a/man/is_shadow.Rd b/man/is_shadow.Rd
diff --git a/man/nabular.Rd b/man/nabular.Rd
diff --git a/tests/testthat/test-bind-shadow.R b/tests/testthat/test-bind-shadow.R
@@ -7,7 +7,7 @@ test_that("bind_shadow returns a data.frame",{
 
 test_that("bind_shadow returns a tibble",{
   expect_equal(class(bind_shadow(airquality)),
-               c("shadow", "tbl_df", "tbl", "data.frame"))
+               c("nabular", "tbl_df", "tbl", "data.frame"))
 })
 
 test_that("bind_shadow errors when given non dataframe or 0 entry",{