diff --git a/.gitignore b/.gitignore index a126660..000d6b5 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ README.html tests/test-data +meta diff --git a/DESCRIPTION b/DESCRIPTION index 7efefa9..d33d0e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ukbschemas Type: Package Title: Download the UK Biobank data schemas to an SQLite database -Version: 0.2.0 +Version: 0.2.1 Author: Benjamin J. Cairns Maintainer: Benjamin J. Cairns Description: ukbschema downloads the UK Biobank data dictionaries (schemas) from the UK Biobank Data Showcase and stores them in an SQLite database. The database can be loaded back into R, or queried using sqlite3 or any of a wide range of computing packages. diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..2b148b5 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,10 @@ +# ukbschemas 0.2.1 + +* Added further tables with information about field properties `stability`, `item_type`, `strata` and `sexed` +* Minor additions to `README.md` +* Added a `NEWS.md` file to track changes to the package. + +# ukbschemas 0.2.0 + +* First public release +* Create a database containing a curated set of tables storing the UK Biobank data schemas \ No newline at end of file diff --git a/R/sysdata.rda b/R/sysdata.rda index 7c3fdc2..194bad1 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/tidy-schemas.R b/R/tidy-schemas.R index f757c28..1d34bcf 100644 --- a/R/tidy-schemas.R +++ b/R/tidy-schemas.R @@ -8,18 +8,31 @@ # Add the missing valuetypes table sch <- - sch %>% append(list(valuetypes=VALUE_TYPES)) - if (!silent) cat("... Added table `valuetypes`\n") + sch %>% append( + list( + valuetypes = VALUE_TYPES, + stability = STABILITY, + itemtypes = ITEM_TYPES, + strata = STRATA, + sexed = SEXED + ) + ) + if (!silent) cat("... 
Added property type tables\n") # Rename columns as needed sch$fields <- sch$fields %>% - dplyr::rename(value_type_id = .data$value_type) + dplyr::rename( + value_type_id = .data$value_type, + stability_id = .data$stability, + item_type_id = .data$item_type, + strata_id = .data$strata, + sexed_id = .data$sexed, + category_id = .data$main_category + ) sch$encodings <- sch$encodings %>% dplyr::rename(value_type_id = .data$coded_as) - sch$fields <- sch$fields %>% - dplyr::rename(category_id = .data$main_category) if (!silent) { - cat("... Rename to value_type_id in tables ") + cat("... Rename coded properties in tables ") cat("`fields` and `encodings` \n") } diff --git a/README.Rmd b/README.Rmd index dbcc39a..0181267 100644 --- a/README.Rmd +++ b/README.Rmd @@ -76,6 +76,8 @@ db <- ukbschemas_db(path = tempdir(), overwrite = TRUE, as_is = TRUE) The `overwrite` option allows the database file to be overwritten (if `TRUE`), or prevents this (`FALSE`), or if not specified and the session is interactive (`interactive() == TRUE`) then the user is prompted to decide. +**Note:** If you have created a schemas database with an earlier version of ukbschemas, it should be possible to load that database with the latest version of `load_db()`, which (currently) should load any SQLite database, regardless of contents. + #### Load-Save workflow The second approach is to download the schemas and store them in memory in a list, and save them to a database only as requried. @@ -98,10 +100,12 @@ This package was originally written in bash (a Unix shell scripting language). H * All the encoding value tables (`esimpint`, `esimpstring`, `esimpreal`, `esimpdate`, `ehierint`, `ehierstring`) have been harmonised and combined into a single table `encvalues`. The `value` column in `encvalues` has type `TEXT`, but a `type` column has been added in case the value is not clear from context. The original type-specific tables have been deleted. 
* To avoid redunancy, category parent-child relationships have been moved to table `categories`, as column `parent_id`, from table `catbrowse` (which has been deleted). * Reference to the category to which a field belongs is in the `main_category` column in the `fields` schema, but has been renamed to `category_id` for consistency with the `categories` schema. -* The value types described [on the UKB Showcase](http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type) have been added manually to a table `valuetypes` and appropriate ID references have been renamed to `value_type_id` in tables `fields` and `encodings`. +* Details of several of the field properties (`value_type`, `stability`, `item_type`, `strata` and `sexed`) are available elsewhere on the Data Showcase. These have been added manually to tables `valuetypes`, `stability`, `itemtypes`, `strata` and `sexed`, and appropriate ID references have been renamed with the `_id` suffix in tables `fields` and `encodings`. +* There are several columns in the tables which are not well-documented (e.g. `base_type` in `fields`, `availability` in `encodings` and `categories`, and others). Additional tables documenting these encoded values may be included in future versions (and suggestions are welcome). #### Known code issues -* The UK Biobank data schemas are regularly updated as new data are added to the system. ukbschemas does not currently include a facility for updating the database; it is necessary to create a new database. +* The UK Biobank data schemas are regularly updated as new data are added to the system. ukbschemas does not currently include a facility for updating the database; it is necessary to create a new database. +* Because `readr::read_csv()` reads whole numbers as type `double`, not `integer` (allowing whole numbers beyond the 32-bit `integer` range, represented exactly up to 2^53), column types in schemas loaded in R will differ depending on whether the schemas are loaded directly to R or first saved to a database. 
This should make little or no difference for most applications. * Any [other issues](https://github.com/bjcairns/ukbschemas/issues). diff --git a/README.md b/README.md index 5dc530c..72f88e7 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ By default, the database is named `ukb-schemas-YYYY-MM-DD.sqlite` (where `YYYY-MM-DD` is the current date) and placed in the current working directory. (`path = tempdir()` in the above example puts it in the current temporary directory instead.) At the most recent compilation of -the database (19 July 2019), the size of the .sqlite database file +the database (03 August 2019), the size of the .sqlite database file produced by `ukbschemas_db()` was approximately 10.1MB. Note that without further arguments, `ukbschemas_db()` tidies up the @@ -66,6 +66,11 @@ The `overwrite` option allows the database file to be overwritten (if is interactive (`interactive() == TRUE`) then the user is prompted to decide. +**Note:** If you have created a schemas database with an earlier version +of ukbschemas, it should be possible to load that database with the +latest version of `load_db()`, which (currently) should load any SQLite +database, regardless of contents. + #### Load-Save workflow The second approach is to download the schemas and store them in memory @@ -104,11 +109,17 @@ software (not even SQLite). - Reference to the category to which a field belongs is in the `main_category` column in the `fields` schema, but has been renamed to `category_id` for consistency with the `categories` schema. - - The value types described [on the UKB - Showcase](http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type) - have been added manually to a table `valuetypes` and appropriate ID - references have been renamed to `value_type_id` in tables `fields` - and `encodings`. + - Details of several of the field properties (`value_type`, + `stability`, `item_type`, `strata` and `sexed`) are available + elsewhere on the Data Showcase. 
These have been added manually to + tables `valuetypes`, `stability`, `itemtypes`, `strata` and `sexed`, + and appropriate ID references have been renamed with the `_id` + suffix in tables `fields` and `encodings`. + - There are several columns in the tables which are not + well-documented (e.g. `base_type` in `fields`, `availability` in + `encodings` and `categories`, and others). Additional tables + documenting these encoded values may be included in future versions + (and suggestions are welcome). #### Known code issues @@ -116,4 +127,10 @@ software (not even SQLite). added to the system. ukbschemas does not currently include a facility for updating the database; it is necessary to create a new database. + - Because `readr::read_csv()` reads whole numbers as type `double`, + not `integer` (allowing whole numbers beyond the 32-bit `integer` + range, represented exactly up to 2^53), column types in schemas loaded in R will differ + depending on whether the schemas are loaded directly to R or first + saved to a database. This should make little or no difference for + most applications. - Any [other issues](https://github.com/bjcairns/ukbschemas/issues). diff --git a/data-raw/aux_tables.R b/data-raw/aux_tables.R new file mode 100644 index 0000000..7fe8459 --- /dev/null +++ b/data-raw/aux_tables.R @@ -0,0 +1,69 @@ +# Value types from +# http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type +# Note that the IDs for "Binary object" and "Records" have been set arbitrarily. +# The value_type_id 0 has been added for consistency with the coded_as column +# of `encodings`. 
+# +# Last update: 2019-03-16 +VALUE_TYPES <- tibble::tribble( + ~value_type_id, ~title, ~description, + 0L, "", "", + 11L, "Integer", "whole numbers, for example the age of a participant on a particular date", + 21L, "Categorical (single)", "a single answer selected from a coded list or tree of mutually exclusive options, for example a yes/no choice", + 22L, "Categorical (multiple)", "sets of answers selected from a coded list or tree of options, for instance concurrent medications", + 31L, "Continuous", "floating-point numbers, for example the height of a participant", + 41L, "Text", "data composed of alphanumeric characters, for example the first line of an address", + 51L, "Date", "a calendar date, for example 14th October 2010", + 61L, "Time", "a time, for example 13:38:05 on 14th October 2010", + 101L, "Compound", "a set of values required as a whole to describe some compound property, for example an ECG trace", + 998L, "Binary object", "a complex dataset (blob), for example an image", + 999L, "Records", "a summary showing the volume of records data available via the secure portal" +) + +# Stability from +# https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=stability +# Note that the ID for "Obsolete" has been set arbitrarily. 
+# +# Last update: 2019-08-02 +STABILITY <- tibble::tribble( + ~stability_id, ~title, ~description, + 0L, "Complete", "all data has been collected and will never change, an example would be the date at which a participant joined UK Biobank", + 1L, "Updateable", "all data has been collected, but the values may change over time, an example would be the volume of the initial blood samples collected by UKB which will decrease as new analyses are performed using them", + 2L, "Accruing", "data is still being gathered", + 3L, "Ongoing", "data is still being gathered and the values already held may change over time", + 4L, "Obsolete", "data which has been superceded by other fields and is not recommended for use" +) + +# Item types from +# https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=item_type +# +# Last update: 2019-08-02 +ITEM_TYPES <- tibble::tribble( + ~item_type_id, ~title, ~description, + 0L, "Data", "data values, of elementary types or with simple structures", + 10L, "Samples", "inventory information corresponding to biological samples held by UK Biobank", + 20L, "Bulk", "large complex objects, typically binary files (blobs) which cannot be decomposed into smaller chunks", + 30L, "Records", "inventory information describing the number of (records) held" +) + +# Strata from +# https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=strata +# +# Last update: 2019-08-02 +STRATA <- tibble::tribble( + ~strata_id, ~title, ~description, + 0L, "Primary", "the key clinically/scientifically relevant data-fields", + 1L, "Supporting", "data which is clinical/scientific in nature, but largely superceded by a Primary data-field", + 2L, "Auxiliary", "data which describes the systems or processes used to acquire the data", + 3L, "Derived", "data which has been constructed by combining/processing values from one or more other data-fields" +) + +# Sexed from (source URL assumed by analogy with the tables above; TODO confirm) +# https://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=sexed +# Last update: 2019-08-02 +SEXED <- tibble::tribble( + ~sexed_id, ~title, ~description, + 0L, "Unisex", "for example 
height", + 1L, "Males only", "for example prostate cancer history", + 2L, "Females only", "for example age at menopause" +) diff --git a/data-raw/internal-variables.R b/data-raw/internal-variables.R index 0cf4825..54e1132 100644 --- a/data-raw/internal-variables.R +++ b/data-raw/internal-variables.R @@ -30,29 +30,6 @@ ids <- as.integer(c(1:14, 999)) SCHEMA_FILENAMES <- tibble::tibble(id = ids, filename = filenames) -# Value types from -# http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=value_type -# Note that the IDs for "Binary object" and "Records" have been set arbitrarily. -# The value_type_id 0 has been added for consistency with the coded_as column -# of `encodings`. -# -# Last update: 2019-03-16 -VALUE_TYPES <- tibble::tribble( - ~value_type_id, ~title, ~description, - 0L, "", "", - 11L, "Integer", "whole numbers, for example the age of a participant on a particular date", - 21L, "Categorical (single)", "a single answer selected from a coded list or tree of mutually exclusive options, for example a yes/no choice", - 22L, "Categorical (multiple)", "sets of answers selected from a coded list or tree of options, for instance concurrent medications", - 31L, "Continuous", "floating-point numbers, for example the height of a participant", - 41L, "Text", "data composed of alphanumeric characters, for example the first line of an address", - 51L, "Date", "a calendar date, for example 14th October 2010", - 61L, "Time", "a time, for example 13:38:05 on 14th October 2010", - 101L, "Compound", "a set of values required as a whole to describe some compound property, for example an ECG trace", - 998L, "Binary object", "a complex dataset (blob), for example an image", - 999L, "Records", "a summary showing the volume of records data available via the secure portal" -) - - # Errors UKBSCHEMAS_ERRORS <- list( OVERWRITE = "Will not overwrite existing file without 'overwrite=TRUE'", @@ -65,10 +42,13 @@ UKBSCHEMAS_ERRORS <- list( WARN_DB_CONNECTED = "Database object is already 
connected" ) +# Additional tables to add to the schemas +source("data-raw/aux_tables.R") + # Export to R/sysdata.rda usethis::use_data( - UKB_SCHEMAS_URL, UKB_URL_PREFIX, SCHEMA_FILENAMES, VALUE_TYPES, - UKBSCHEMAS_ERRORS, + VALUE_TYPES, STABILITY, ITEM_TYPES, STRATA, SEXED, + UKB_SCHEMAS_URL, UKB_URL_PREFIX, SCHEMA_FILENAMES, UKBSCHEMAS_ERRORS, internal = TRUE, overwrite = TRUE ) diff --git a/inst/sql/ukb-schemas.sql b/inst/sql/ukb-schemas.sql index 7c60b5e..1462e3d 100644 --- a/inst/sql/ukb-schemas.sql +++ b/inst/sql/ukb-schemas.sql @@ -2,15 +2,15 @@ CREATE TABLE fields( "field_id" INTEGER PRIMARY KEY, "title" TEXT, "availability" INTEGER, - "stability" INTEGER, + "stability_id" INTEGER, "private" INTEGER, "value_type_id" INTEGER, /* In the original table this column is called "value_type" */ "base_type" INTEGER, - "item_type" INTEGER, - "strata" INTEGER, + "item_type_id" INTEGER, + "strata_id" INTEGER, "instanced" INTEGER, "arrayed" INTEGER, - "sexed" INTEGER, + "sexed_id" INTEGER, "units" TEXT, "category_id" INTEGER, "encoding_id" INTEGER, @@ -69,11 +69,6 @@ CREATE TABLE recommended( "category_id" INTEGER, "field_id" INTEGER ); -CREATE TABLE valuetypes( - "value_type_id" INTEGER PRIMARY KEY, - "title" TEXT, - "description" TEXT -); CREATE TABLE encvalues( "encoding_id" INTEGER, "code_id" INTEGER, @@ -91,3 +86,28 @@ CREATE TABLE schema( "descript" TEXT, "notes" TEXT ); +CREATE TABLE valuetypes( + "value_type_id" INTEGER PRIMARY KEY, + "title" TEXT, + "description" TEXT +); +CREATE TABLE stability( + "stability_id" INTEGER PRIMARY KEY, + "title" TEXT, + "description" TEXT +); +CREATE TABLE itemtypes( + "item_type_id" INTEGER PRIMARY KEY, + "title" TEXT, + "description" TEXT +); +CREATE TABLE strata( + "strata_id" INTEGER PRIMARY KEY, + "title" TEXT, + "description" TEXT +); +CREATE TABLE sexed( + "sexed_id" INTEGER PRIMARY KEY, + "title" TEXT, + "description" TEXT +);