Merge pull request #21 from bjcairns/dev

Release 0.2.1 (closes #19)
bjcairns · Aug 3, 2019 · 6ea851f · 6ea851f
2 parents 5a8c6c7 + b5f3435
commit 6ea851f
Show file tree

Hide file tree

Showing 10 changed files with 163 additions and 49 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@
 README.html
 
 tests/test-data
+meta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: ukbschemas
 Type: Package
 Title: Download the UK Biobank data schemas to an SQLite database
-Version: 0.2.0
+Version: 0.2.1
 Author: Benjamin J. Cairns <ben.cairns@ndph.ox.ac.uk>
 Maintainer: Benjamin J. Cairns <ben.cairns@ndph.ox.ac.uk>
 Description: ukbschemas downloads the UK Biobank data dictionaries (schemas) from the UK Biobank Data Showcase and stores them in an SQLite database. The database can be loaded back into R, or queried using sqlite3 or any of a wide range of computing packages.

diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,10 @@
+# ukbschemas 0.2.1
+
+* Added further tables with information about field properties `stability`, `item_type`, `strata` and `sexed`
+* Minor additions to `README.md`
+* Added a `NEWS.md` file to track changes to the package.
+
+# ukbschemas 0.2.0
+
+* First public release
+* Create a database containing a curated set of tables storing the UK Biobank data schemas
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/R/tidy-schemas.R b/R/tidy-schemas.R
@@ -8,18 +8,31 @@
 
   # Add the missing valuetypes table
   sch <- 
-    sch %>% append(list(valuetypes=VALUE_TYPES))
-  if (!silent) cat("... Added table `valuetypes`\n")
+    sch %>% append(
+      list(
+        valuetypes = VALUE_TYPES,
+        stability = STABILITY,
+        itemtypes = ITEM_TYPES,
+        strata = STRATA,
+        sexed = SEXED
+      )
+    )
+  if (!silent) cat("... Added property type tables\n")
 
   # Rename columns as needed
   sch$fields <- sch$fields %>%
-    dplyr::rename(value_type_id = .data$value_type)
+    dplyr::rename(
+      value_type_id = .data$value_type,
+      stability_id = .data$stability,
+      item_type_id = .data$item_type,
+      strata_id = .data$strata,
+      sexed_id = .data$sexed,
+      category_id = .data$main_category
+    )
   sch$encodings <- sch$encodings %>%
     dplyr::rename(value_type_id = .data$coded_as)
-  sch$fields <- sch$fields %>%
-    dplyr::rename(category_id = .data$main_category)
   if (!silent) {
-    cat("... Rename to value_type_id in tables ")
+    cat("... Rename coded properties in tables ")
     cat("`fields` and `encodings` \n")
   }
 

diff --git a/README.Rmd b/README.Rmd
@@ -76,6 +76,8 @@ db <- ukbschemas_db(path = tempdir(), overwrite = TRUE, as_is = TRUE)
 
 The `overwrite` option allows the database file to be overwritten (if `TRUE`), or prevents this (`FALSE`), or if not specified and the session is interactive (`interactive() == TRUE`) then the user is prompted to decide.
 
+**Note:** If you have created a schemas database with an earlier version of ukbschemas, it should be possible to load that database with the latest version of `load_db()`, which (currently) should load any SQLite database, regardless of contents.
+
 #### Load-Save workflow
 
 The second approach is to download the schemas and store them in memory in a list, and save them to a database only as required. 
@@ -98,10 +100,12 @@ This package was originally written in bash (a Unix shell scripting language). H
 * All the encoding value tables (`esimpint`, `esimpstring`, `esimpreal`, `esimpdate`, `ehierint`, `ehierstring`) have been harmonised and combined into a single table `encvalues`. The `value` column in `encvalues` has type `TEXT`, but a `type` column has been added in case the value is not clear from context. The original type-specific tables have been deleted.
 * To avoid redundancy, category parent-child relationships have been moved to table `categories`, as column `parent_id`, from table `catbrowse` (which has been deleted).
 * Reference to the category to which a field belongs is in the `main_category` column in the `fields` schema, but has been renamed to `category_id` for consistency with the `categories` schema.
-* The value types described [on the UKB Showcase](http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type) have been added manually to a table `valuetypes` and appropriate ID references have been renamed to `value_type_id` in tables `fields` and `encodings`.
+* Details of several of the field properties (`value_type`, `stability`, `item_type`, `strata` and `sexed`) are available elsewhere on the Data Showcase. These have been added manually to tables `valuetypes`, `stability`, `itemtypes`, `strata` and `sexed`, and appropriate ID references have been renamed with the `_id` suffix in tables `fields` and `encodings`.
+* There are several columns in the tables which are not well-documented (e.g. `base_type` in `fields`, `availability` in `encodings` and `categories`, and others). Additional tables documenting these encoded values may be included in future versions (and suggestions are welcome).
 
 #### Known code issues
 
-* The UK Biobank data schemas are regularly updated as new data are added to the system. ukbschemas does not currently include a facility for updating the database; it is necessary to create a new database.
+* The UK Biobank data schemas are regularly updated as new data are added to the system. ukbschemas does not currently include a facility for updating the database; it is necessary to create a new database. 
+* Because `readr::read_csv()` reads whole numbers as type `double`, not `integer` (so that values beyond R's 32-bit integer range, up to 2^53, are stored without loss of information), column types in schemas loaded in R will differ depending on whether the schemas are loaded directly to R or first saved to a database. This should make little or no difference for most applications.
 * Any [other issues](https://github.com/bjcairns/ukbschemas/issues).
 
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ By default, the database is named `ukb-schemas-YYYY-MM-DD.sqlite` (where
 `YYYY-MM-DD` is the current date) and placed in the current working
 directory. (`path = tempdir()` in the above example puts it in the
 current temporary directory instead.) At the most recent compilation of
-the database (19 July 2019), the size of the .sqlite database file
+the database (03 August 2019), the size of the .sqlite database file
 produced by `ukbschemas_db()` was approximately 10.1MB.
 
 Note that without further arguments, `ukbschemas_db()` tidies up the
@@ -66,6 +66,11 @@ The `overwrite` option allows the database file to be overwritten (if
 is interactive (`interactive() == TRUE`) then the user is prompted to
 decide.
 
+**Note:** If you have created a schemas database with an earlier version
+of ukbschemas, it should be possible to load that database with the
+latest version of `load_db()`, which (currently) should load any SQLite
+database, regardless of contents.
+
 #### Load-Save workflow
 
 The second approach is to download the schemas and store them in memory
@@ -104,16 +109,28 @@ software (not even SQLite).
   - Reference to the category to which a field belongs is in the
     `main_category` column in the `fields` schema, but has been renamed
     to `category_id` for consistency with the `categories` schema.
-  - The value types described [on the UKB
-    Showcase](http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type)
-    have been added manually to a table `valuetypes` and appropriate ID
-    references have been renamed to `value_type_id` in tables `fields`
-    and `encodings`.
+  - Details of several of the field properties (`value_type`,
+    `stability`, `item_type`, `strata` and `sexed`) are available
+    elsewhere on the Data Showcase. These have been added manually to
+    tables `valuetypes`, `stability`, `itemtypes`, `strata` and `sexed`,
+    and appropriate ID references have been renamed with the `_id`
+    suffix in tables `fields` and `encodings`.
+  - There are several columns in the tables which are not
+    well-documented (e.g. `base_type` in `fields`, `availability` in
+    `encodings` and `categories`, and others). Additional tables
+    documenting these encoded values may be included in future versions
+    (and suggestions are welcome).
 
 #### Known code issues
 
   - The UK Biobank data schemas are regularly updated as new data are
     added to the system. ukbschemas does not currently include a
     facility for updating the database; it is necessary to create a new
     database.
+  - Because `readr::read_csv()` reads whole numbers as type `double`,
+    not `integer` (so that values beyond R's 32-bit integer range, up
+    to 2^53, are stored without loss of information), column types in
+    schemas loaded in R will differ depending on whether the schemas
+    are loaded directly to R or first saved to a database. This should
+    make little or no difference for most applications.
   - Any [other issues](https://github.com/bjcairns/ukbschemas/issues).
diff --git a/data-raw/aux_tables.R b/data-raw/aux_tables.R
@@ -0,0 +1,69 @@
+# Value types from 
+#   http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type
+# Note that the IDs for "Binary object" and "Records" have been set arbitrarily.
+# The value_type_id 0 has been added for consistency with the coded_as column 
+# of `encodings`.
+#
+# Last update: 2019-03-16
+VALUE_TYPES <- tibble::tribble(
+  ~value_type_id, ~title, ~description,
+  0L, "", "",
+  11L, "Integer", "whole numbers, for example the age of a participant on a particular date",
+  21L, "Categorical (single)", "a single answer selected from a coded list or tree of mutually exclusive options, for example a yes/no choice",
+  22L, "Categorical (multiple)", "sets of answers selected from a coded list or tree of options, for instance concurrent medications",
+  31L, "Continuous", "floating-point numbers, for example the height of a participant",
+  41L, "Text", "data composed of alphanumeric characters, for example the first line of an address",
+  51L, "Date", "a calendar date, for example 14th October 2010",
+  61L, "Time", "a time, for example 13:38:05 on 14th October 2010",
+  101L, "Compound", "a set of values required as a whole to describe some compound property, for example an ECG trace",
+  998L, "Binary object", "a complex dataset (blob), for example an image",
+  999L, "Records", "a summary showing the volume of records data available via the secure portal"
+)
+
+# Stability from
+#   https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=stability
+# Note that the ID for "Obsolete" has been set arbitrarily.
+#
+# Last update: 2019-08-02
+STABILITY <- tibble::tribble(
+  ~stability_id, ~title, ~description,
+  0L, "Complete", "all data has been collected and will never change, an example would be the date at which a participant joined UK Biobank",
+  1L, "Updateable", "all data has been collected, but the values may change over time, an example would be the volume of the initial blood samples collected by UKB which will decrease as new analyses are performed using them",
+  2L, "Accruing", "data is still being gathered",
+  3L, "Ongoing", "data is still being gathered and the values already held may change over time",
+  4L, "Obsolete", "data which has been superceded by other fields and is not recommended for use"
+)
+
+# Item types from
+#   https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=item_type
+#
+# Last update: 2019-08-02
+ITEM_TYPES <- tibble::tribble(
+  ~item_type_id, ~title, ~description,
+  0L, "Data", "data values, of elementary types or with simple structures",
+  10L, "Samples", "inventory information corresponding to biological samples held by UK Biobank",
+  20L, "Bulk", "large complex objects, typically binary files (blobs) which cannot be decomposed into smaller chunks",
+  30L, "Records", "inventory information describing the number of (records) held"
+)
+
+# Strata from
+#   https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=strata
+#
+# Last update: 2019-08-02
+STRATA <- tibble::tribble(
+  ~strata_id, ~title, ~description,
+  0L, "Primary", "the key clinically/scientifically relevant data-fields",
+  1L, "Supporting", "data which is clinical/scientific in nature, but largely superceded by a Primary data-field",
+  2L, "Auxiliary", "data which describes the systems or processes used to acquire the data",
+  3L, "Derived", "data which has been constructed by combining/processing values from one or more other data-fields"
+)
+
+# Sexed from
+#   
+# Last update: 2019-08-02
+SEXED <- tibble::tribble(
+  ~sexed_id, ~title, ~description,
+  0L, "Unisex", "for example height",
+  1L, "Males only", "for example prostate cancer history",
+  2L, "Females only", "for example age at menopause"
+)
diff --git a/data-raw/internal-variables.R b/data-raw/internal-variables.R
@@ -30,29 +30,6 @@ ids <- as.integer(c(1:14, 999))
 SCHEMA_FILENAMES <- tibble::tibble(id = ids, filename = filenames)
 
 
-# Value types from 
-#   http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type
-# Note that the IDs for "Binary object" and "Records" have been set arbitrarily.
-# The value_type_id 0 has been added for consistency with the coded_as column 
-# of `encodings`.
-#
-# Last update: 2019-03-16
-VALUE_TYPES <- tibble::tribble(
-  ~value_type_id, ~title, ~description,
-  0L, "", "",
-  11L, "Integer", "whole numbers, for example the age of a participant on a particular date",
-  21L, "Categorical (single)", "a single answer selected from a coded list or tree of mutually exclusive options, for example a yes/no choice",
-  22L, "Categorical (multiple)", "sets of answers selected from a coded list or tree of options, for instance concurrent medications",
-  31L, "Continuous", "floating-point numbers, for example the height of a participant",
-  41L, "Text", "data composed of alphanumeric characters, for example the first line of an address",
-  51L, "Date", "a calendar date, for example 14th October 2010",
-  61L, "Time", "a time, for example 13:38:05 on 14th October 2010",
-  101L, "Compound", "a set of values required as a whole to describe some compound property, for example an ECG trace",
-  998L, "Binary object", "a complex dataset (blob), for example an image",
-  999L, "Records", "a summary showing the volume of records data available via the secure portal"
-)
-
-
 # Errors
 UKBSCHEMAS_ERRORS <- list(
   OVERWRITE = "Will not overwrite existing file without 'overwrite=TRUE'",
@@ -65,10 +42,13 @@ UKBSCHEMAS_ERRORS <- list(
   WARN_DB_CONNECTED = "Database object is already connected"
 )
 
+# Additional tables to add to the schemas
+source("data-raw/aux_tables.R")
+
 
 # Export to R/sysdata.rda
 usethis::use_data(
-  UKB_SCHEMAS_URL, UKB_URL_PREFIX, SCHEMA_FILENAMES, VALUE_TYPES, 
-  UKBSCHEMAS_ERRORS,
+  VALUE_TYPES, STABILITY, ITEM_TYPES, STRATA, SEXED,
+  UKB_SCHEMAS_URL, UKB_URL_PREFIX, SCHEMA_FILENAMES, UKBSCHEMAS_ERRORS,
   internal = TRUE, overwrite = TRUE
 )
diff --git a/inst/sql/ukb-schemas.sql b/inst/sql/ukb-schemas.sql
@@ -2,15 +2,15 @@ CREATE TABLE fields(
   "field_id" INTEGER PRIMARY KEY,
   "title" TEXT,
   "availability" INTEGER,
-  "stability" INTEGER,
+  "stability_id" INTEGER,
   "private" INTEGER,
   "value_type_id" INTEGER, /* In the original table this column is called "value_type" */
   "base_type" INTEGER,
-  "item_type" INTEGER,
-  "strata" INTEGER,
+  "item_type_id" INTEGER,
+  "strata_id" INTEGER,
   "instanced" INTEGER,
   "arrayed" INTEGER,
-  "sexed" INTEGER,
+  "sexed_id" INTEGER,
   "units" TEXT,
   "category_id" INTEGER,
   "encoding_id" INTEGER,
@@ -69,11 +69,6 @@ CREATE TABLE recommended(
   "category_id" INTEGER,
   "field_id" INTEGER
 );
-CREATE TABLE valuetypes(
-  "value_type_id" INTEGER PRIMARY KEY,
-  "title" TEXT,
-  "description" TEXT
-);
 CREATE TABLE encvalues(
   "encoding_id" INTEGER,
   "code_id" INTEGER,
@@ -91,3 +86,28 @@ CREATE TABLE schema(
   "descript" TEXT,
   "notes" TEXT
 );
+CREATE TABLE valuetypes(
+  "value_type_id" INTEGER PRIMARY KEY,
+  "title" TEXT,
+  "description" TEXT
+);
+CREATE TABLE stability(
+  "stability_id" INTEGER PRIMARY KEY,
+  "title" TEXT,
+  "description" TEXT
+);
+CREATE TABLE itemtypes(
+  "item_type_id" INTEGER PRIMARY KEY,
+  "title" TEXT,
+  "description" TEXT
+);
+CREATE TABLE strata(
+  "strata_id" INTEGER PRIMARY KEY,
+  "title" TEXT,
+  "description" TEXT
+);
+CREATE TABLE sexed(
+  "sexed_id" INTEGER PRIMARY KEY,
+  "title" TEXT,
+  "description" TEXT
+);