-
Notifications
You must be signed in to change notification settings - Fork 1
/
build_seroprev_data.R
347 lines (294 loc) · 17 KB
/
build_seroprev_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
# Process sero(logical) prevalence data @ NHS England regional scales for covidm
# We use a combination of different data sources for England and English regions
# and add new data to existing data on an ad-hoc basis (unlike virus prevalence)
#
# The following data sources are used for regional sero prevalence estimates:
#
# 1. Sero prevalence estimates from the UK Biobank, see:
# https://www.ukbiobank.ac.uk/learn-more-about-uk-biobank/covid-19-hub
#
# 2. Sero prevalence estimates from Imperial College London's REACT-2 study, see:
# https://www.imperial.ac.uk/medicine/research-and-impact/groups/react-study/
#
# 3. Sero prevalence estimates from the Office for National Statistics, see:
# https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/conditionsanddiseases/datasets/coronaviruscovid19antibodydatafortheuk
library(readxl)
library(lubridate)
library(stringr) #
library(data.table) # these libraries must be loaded for ogwrangler to work
library(glue) #
library(ogwrangler) # see https://github.com/nicholasdavies/ogwrangler
library(ggplot2)
library(ggthemes)
library(cowplot)
# remove existing objects in environment
rm(list=ls())
# Specify existing .csv file containing seroprevalence estimates
file = './fitting_data/seroprev_nhs_regions_20220421090540.csv'
# Load existing file
sero = read.csv(file)
# Make sure dates in sero are consistent formats
if (is.Date(sero$Start.date) == FALSE){
sero$Start.date = as.Date(sero$Start.date)
}
if (is.Date(sero$End.date) == FALSE){
sero$End.date = as.Date(sero$End.date)
}
# Specify path to file, filename and the publication date of new ONS data
newdata = './fitting_data/covid19infectionsurveyantibodies20220504.xlsx'
# We usually find the data required within Excel sheets 1b, but this may change
# if the format of the published .xlsx file changes, so please check! Note that
# equivalent data for England as a whole is (currently) found in sheet 1a
s1b = read_excel(newdata, sheet = "1b")
s1a = read_excel(newdata, sheet = "1a")
# find last row containing data - this assumes that specific text will continue
# to be placed in the row below the last row of data, so please check in future
last_idx_eng = which(s1a$Contents == 'Source: Office for National Statistics – Coronavirus (COVID-19) Infection Survey')
last_idx = which(s1b$Contents == 'Source: Office for National Statistics – Coronavirus (COVID-19) Infection Survey')
# this also assumes row 4 contains appropriate headers, please check in future
s1ad = read_excel(newdata, sheet = "1a", range = cell_rows(6:last_idx_eng))
s1bd = read_excel(newdata, sheet = "1b", range = cell_rows(5:last_idx))
# get correct date ranges for each row entry in s1ad
start_dates_eng = as.Date(NA)
end_dates_eng = as.Date(NA)
for (i in 1:dim(s1ad)[1]){
daterange = s1ad$`Weekly period`[i]
s1ad_dates = strsplit(daterange, split = " to ")[[1]]
start_dates_eng[i] = as.Date(s1ad_dates[1], format = '%d %b %Y')
end_dates_eng[i] = as.Date(s1ad_dates[2], format = '%d %b %Y')
}
s1ad$Start.date = start_dates_eng
s1ad$End.date = end_dates_eng
# get correct date ranges for each row entry in s1bd
start_dates = as.Date(NA)
end_dates = as.Date(NA)
for (i in 2:dim(s1bd)[1]){
daterange = s1bd$`...1`[i]
s1bd_dates = strsplit(daterange, split = " to ")[[1]]
start_dates[i] = as.Date(s1bd_dates[1], format = '%d %b %Y')
end_dates[i] = as.Date(s1bd_dates[2], format = '%d %b %Y')
}
s1bd$Start.date = start_dates
s1bd$End.date = end_dates
# get PHE regions and codes (used by ONS and REACT)
regions = read.csv("./fitting_data/PHE_England_regions.csv")
# initialise data frames to store resized central, lower, upper bound estimates
rszd_swab_prev = data.frame()
rszd_swab_prev_lb = data.frame()
rszd_swab_prev_ub = data.frame()
# for each date range, convert seroprevalence estimates for all regions
for (i in 2:dim(s1bd)[1]){
# print progress of loop
print(paste0('Processing date ', i-1, ' of ', dim(s1bd)[1]-1))
# get list of central estimates to convert
estimates = as.numeric(c(s1bd$`North East`[i],
s1bd$`North West`[i],
s1bd$`Yorkshire and The Humber`[i],
s1bd$`East Midlands`[i],
s1bd$`West Midlands`[i],
s1bd$`East of England`[i],
s1bd$London[i],
s1bd$`South East`[i],
s1bd$`South West`[i]))
# convert estimates to fit nhs regions
resized_estimates = ogwrangle(estimates, regions$RGN19CD, "e.reg",
"e.nhser20", "pop2018", "proportion")
# store converted central estimates
rszd_swab_prev = rbind(rszd_swab_prev, resized_estimates$estimates)
# get list of lower bound estimates
lb_estimates = as.numeric(c(s1bd$...3[i], # North East
s1bd$...7[i], # North West
s1bd$...11[i], # Yorkshire and The Humber
s1bd$...15[i], # East Midlands
s1bd$...19[i], # West Midlands
s1bd$...23[i], # East of England
s1bd$...27[i], # London
s1bd$...31[i], # South East
s1bd$...35[i])) # South West
# convert lower bound estimates to fit nhs regions
resized_lb_estimates = ogwrangle(lb_estimates, regions$RGN19CD, "e.reg",
"e.nhser20", "pop2018", "proportion")
# store converted lower bound estimates
rszd_swab_prev_lb = rbind(rszd_swab_prev_lb,
resized_lb_estimates$lb_estimates)
# get list of upper bound estimates (skipping values for England)
ub_estimates = as.numeric(c(s1bd$...4[i], # North East
s1bd$...8[i], # North West
s1bd$...12[i], # Yorkshire and The Humber
s1bd$...16[i], # East Midlands
s1bd$...20[i], # West Midlands
s1bd$...24[i], # East of England
s1bd$...28[i], # London
s1bd$...32[i], # South East
s1bd$...36[i])) # South West
# convert estimates to fit nhs regions
resized_ub_estimates = ogwrangle(ub_estimates, regions$RGN19CD, "e.reg",
"e.nhser20", "pop2018", "proportion")
# store converted upper bound estimates
rszd_swab_prev_ub = rbind(rszd_swab_prev_ub,
resized_ub_estimates$ub_estimates)
}
# add column headings to all three data frames
names(rszd_swab_prev) = ogname(resized_estimates$e.nhser20)
names(rszd_swab_prev_lb) = ogname(resized_lb_estimates$e.nhser20)
names(rszd_swab_prev_ub) = ogname(resized_ub_estimates$e.nhser20)
# create list of NHS England regional codes
nhs_regions <- c("E40000009", "E40000010", "E40000008", "E40000007",
"E40000003", "E40000005", "E40000006")
names(nhs_regions) <- ogname(nhs_regions)
# initialise final data frame to store sero prevalence data
sero_prev <- data.frame(NHS.region = NULL, Start.date = NULL, End.date = NULL,
Central.estimate = NULL, Lower.bound = NULL,
Upper.bound = NULL, Test = NULL, Median.or.mean = NULL,
Min.age = NULL, Max.age = NULL, Data.source = NULL,
N.tests = NULL, React2.round = NULL)
# count number of entries of sero prevalence per region in rszd_swab_prev
num_entries <- dim(rszd_swab_prev)[1]
for (i in 1:length(nhs_regions)){
# populate entries for this section of the dataframe
this_region = rep(names(nhs_regions)[i], num_entries)
start_date = as.character(start_dates[1:num_entries+1])
end_date = as.character(end_dates[1:num_entries+1])
central_estimate = rszd_swab_prev[,i]
lower_bound = rszd_swab_prev_lb[,i]
upper_bound = rszd_swab_prev_ub[,i]
test <- rep("", num_entries)
median_mean <- rep("", num_entries)
min_age <- rep("", num_entries)
max_age <- rep("", num_entries)
data_source <- rep("ONS-CIS", num_entries)
N_tests <- rep("", num_entries)
react_round <- rep("", num_entries)
# make temporary dataframe to bind to master dataframe
temp_df <- data.frame(NHS.region = this_region,
Start.date = start_date,
End.date = end_date,
Central.estimate = as.numeric(central_estimate),
Lower.bound = as.numeric(lower_bound),
Upper.bound = as.numeric(upper_bound),
Test = test,
Median.or.mean = median_mean,
Min.age = min_age,
Max.age = max_age,
Data.source = data_source,
N.tests = N_tests,
React2.round = react_round)
# bind temporary dataframe to master dataframe
sero_prev <- rbind(sero_prev, temp_df)
}
# add England-wide estimates to sero_prev
if (1){
cat("Adding England entries...")
# populate entries for this section of the dataframe
num_entries = dim(s1ad)[1]
this_region = rep('England', num_entries)
start_date = as.character(start_dates_eng)
end_date = as.character(end_dates_eng)
central_estimate = s1ad$`Modelled % testing positive for antibodies against SARS-CoV-2`
lower_bound = s1ad$`95% Lower credible interval`
upper_bound = s1ad$`95% Upper credible interval`
test <- rep("", num_entries)
median_mean <- rep("", num_entries)
min_age <- rep("", num_entries)
max_age <- rep("", num_entries)
data_source <- rep("ONS-CIS", num_entries)
N_tests <- rep("", num_entries)
react_round <- rep("", num_entries)
# make temporary dataframe to bind to master dataframe
temp_df <- data.frame(NHS.region = this_region,
Start.date = start_date,
End.date = end_date,
Central.estimate = as.numeric(central_estimate),
Lower.bound = as.numeric(lower_bound),
Upper.bound = as.numeric(upper_bound),
Test = test,
Median.or.mean = median_mean,
Min.age = min_age,
Max.age = max_age,
Data.source = data_source,
N.tests = N_tests,
React2.round = react_round)
# bind temporary dataframe to master dataframe
sero_prev <- rbind(sero_prev, temp_df)
}
# append % to estimates (existing estimates have percentage signs which get
# removed later)
sero_prev$Central.estimate = paste0(sero_prev$Central.estimate, "%")
sero_prev$Lower.bound = paste0(sero_prev$Lower.bound, "%")
sero_prev$Upper.bound = paste0(sero_prev$Upper.bound, "%")
# before binding new ONS-CIS data in sero_prev to old ONS-CIS data in sero,
# remove any old data from ONS-CIS in sero which already exist within the date
# ranges in the new data sero_prev
earliest_date = min(start_dates, na.rm = TRUE)
latest_date = max(end_dates, na.rm = TRUE)
series_dates = seq(earliest_date, latest_date, by = 1)
# get row elements to remove - sero$Data.source is ONS-CIS, sero$Start.date
# occurs in series_dates and sero$End.date occurs in series_dates
rtr = which(sero$Data.source == 'ONS-CIS' & sero$Start.date %in% series_dates &
sero$End.date %in% series_dates)
# remove those rows from sero
sero = sero[-(rtr),]
# bind new data to existing data
final_data = rbind(sero, sero_prev)
# save final data
datetime <- str_replace_all(Sys.time(), "[- :BST]", "")
write.csv(final_data, file=paste0("./fitting_data/seroprev_nhs_regions_",
datetime, ".csv"), row.names = F)
# optional: plots to check data (change to 'if (1)' to execute)
if (0){
# function required to remove % signs
pct = function(x) as.numeric(str_replace_all(x, "%", ""))
# plot all regions together as single data points
ggplot(data = final_data[!(final_data$NHS.region %in% c('Wales','Scotland')),]) +
geom_point(aes(x = Start.date + (End.date-Start.date)/2, y = pct(Central.estimate), color = NHS.region, shape = Data.source), size = 1.5) +
labs(x = "Date", y = "Seroprevalence (%)", colour = "NHS England region", shape = 'Data source') +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
scale_color_colorblind()
# plot all regions together with uncertainty included
ggplot(data = final_data[!(final_data$NHS.region %in% c('Wales','Scotland','England')),]) +
geom_point(aes(x = Start.date + (End.date-Start.date)/2, y = pct(Central.estimate), color = NHS.region, shape = Data.source), size = 1.25) +
geom_linerange(aes(xmin = Start.date, xmax = End.date, y = pct(Central.estimate), color = NHS.region, linetype = Data.source), size = 0.2) +
geom_linerange(aes(x = Start.date + (End.date-Start.date)/2, ymin = pct(Lower.bound), ymax = pct(Upper.bound), color = NHS.region, linetype = Data.source), size = 0.2) +
labs(x = "Date", y = "Seroprevalence (%)", colour = "NHS England region", shape = 'Data source') +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
scale_color_colorblind()
# plot regions separately for comparison
ggplot(data = final_data[!(final_data$NHS.region %in% c('Wales','Scotland','England')),]) +
geom_point(aes(x = Start.date + (End.date-Start.date)/2, y = pct(Central.estimate), shape = Data.source, color = Data.source), size = 1.5) +
geom_ribbon(aes(x = Start.date + (End.date-Start.date)/2, ymin = pct(Lower.bound),
ymax = pct(Upper.bound)), alpha = 0.3) +
labs(x = "Date", y = "Seroprevalence (%)", shape = 'Data source', colour = 'Data source') +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
facet_grid(NHS.region ~ ., scales = "free", switch = "y")
# plot regions separately for comparison with uncertainty
ggplot(data = final_data[!(final_data$NHS.region %in% c('Wales','Scotland','England')),]) +
geom_point(aes(x = Start.date + (End.date-Start.date)/2, y = pct(Central.estimate), shape = Data.source, color = Data.source), size = 1.5) +
geom_linerange(aes(xmin = Start.date, xmax = End.date, y = pct(Central.estimate), linetype = Data.source), size = 0.2) +
geom_linerange(aes(x = Start.date + (End.date-Start.date)/2, ymin = pct(Lower.bound), ymax = pct(Upper.bound), linetype = Data.source), size = 0.2) +
labs(x = "Date", y = "Seroprevalence (%)", linetype = 'Data source', colour = 'Data source') +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
facet_grid(NHS.region ~ ., scales = "free", switch = "y")
# plot one region on it's own
ggplot(data = final_data[(final_data$NHS.region == 'South West'),]) +
geom_point(aes(x = Start.date + (End.date-Start.date)/2, y = pct(Central.estimate), shape = Data.source, color = Data.source), size = 1.5) +
geom_linerange(aes(xmin = Start.date, xmax = End.date, y = pct(Central.estimate), linetype = Data.source), size = 0.2) +
geom_linerange(aes(x = Start.date + (End.date-Start.date)/2, ymin = pct(Lower.bound), ymax = pct(Upper.bound), linetype = Data.source), size = 0.2) +
labs(x = "Date", y = "Seroprevalence (%)", linetype = 'Data source', colour = 'Data source') +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y")
# plot each NHS England region + England as a whole on separate panels
p = list()
for (i in 1:length(unique(final_data$NHS.region))){
this_region = unique(final_data$NHS.region)[i]
p[[i]] = ggplot(data = final_data[(final_data$NHS.region == this_region),]) +
geom_point(aes(x = Start.date + (End.date-Start.date)/2,
y = pct(Central.estimate), shape = Data.source, color = Data.source), size = 1.5) +
geom_linerange(aes(xmin = Start.date, xmax = End.date, y = pct(Central.estimate), linetype = Data.source), size = 0.2) +
geom_linerange(aes(x = Start.date + (End.date-Start.date)/2, ymin = pct(Lower.bound), ymax = pct(Upper.bound), linetype = Data.source), size = 0.2) +
labs(x = "Date", y = "Seroprevalence (%)", title = this_region) +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
theme(legend.position = 'none')
}
c1 = plot_grid(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]], nrow = 5)
c2 = plot_grid(p[[6]], p[[7]], p[[8]], p[[9]], p[[10]], nrow = 5)
plot_grid(c1, c2, nrow = 1)
}