-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_heliconia_data.R
220 lines (149 loc) · 7.5 KB
/
clean_heliconia_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
clean_heliconia_data <- function() {
# Code Overview -----------------------------------------------------------
# Script created by Emilio M. Bruna, embruna@ufl.edu
# Script created in R version 3.3.1
# Load libraries ----------------------------------------------------------
library(tidyverse)
library(readxl)
# create version file -----------------------------------------------------
# source("./code/create_version_file.R")
# dataset<-"heliconia_survey_clean"
# create_version_file(dataset)
# STEP 1: Load raw data and prep for cleanup ------------------------------
# load the demog data (for all plots except PA-10) -----------------------
source("./code/survey_cleaning/prep_raw_ha_data.R")
ha_data <- prep_raw_ha_data()
# merge with the census data from PA-10 ha --------------------------------
# The data from Porto Alegre's 10-ha fragment were in a different CSV file.
# They were cleaned up and merged with the rest of the demographic data
# with the function `merge_with_PA10.R`
source("./code/survey_cleaning/merge_with_PA10.R")
ha_data <- merge_with_PA10(ha_data)
# Add a unique plant_id index number for each plant -----------------------
# adding the plant id number here means any subsequent edits or correction
# (now or post-publication will *not* change a plant's unique ID number.
ha_data <- ha_data %>%
group_by(plot, row, column, tag_number) %>%
mutate(plant_id = cur_group_id(), .before = 1)
# STEP 2: Cleanup & Corrections -------------------------------------------
# clean-up the `codes` column ---------------------------------------------
# The survey team often recorded observations about individual plants or
# the conditions in plots. These were entered as numerical codes. The
# function `clean_codes.R` converts them to text to simplify clean-up
source("./code/survey_cleaning/cleanup_codes.R")
ha_data <- cleanup_codes(ha_data)
# STEP 3: Corrections to demographic data ----------------------------------------
# corrections for each demographic plot are found in a separate function
# Continuous Forest
# CF-1 (aka Florestal)
source("./code/survey_cleaning/correct_florestal.R")
ha_data <- correct_florestal(ha_data)
# CF-2 (aka Esteio/Camp 41, 5750)
source("./code/survey_cleaning/correct_5750.R")
ha_data <- correct_5750(ha_data)
# CF-3 (aka Esteio/Camp 41, 5756)
source("./code/survey_cleaning/correct_5756.R")
ha_data <- correct_5756(ha_data)
# CF-4 (aka Dimona-CF)
source("./code/survey_cleaning/correct_dimona_cf.R")
ha_data <- correct_dimona_cf(ha_data)
# CF-5 (aka Porto Alegre CF)
source("./code/survey_cleaning/correct_pa_cf.R")
ha_data <- correct_pa_cf(ha_data)
# CF-6 (aka CaboFrio-CF)
source("./code/survey_cleaning/correct_cabofrio_cf.R")
ha_data <- correct_cabofrio_cf(ha_data)
# 1-ha Fragments
# FF-1 (aka Dimona 1-ha 2107)
source("./code/survey_cleaning/correct_2107.R")
ha_data <- correct_2107(ha_data)
# FF-2 (aka Dimona 1-ha 2108)
source("./code/survey_cleaning/correct_2108.R")
ha_data <- correct_2108(ha_data)
# Corrections FF-3 (aka Colosso 1-ha, 5751) -----------------------------
source("./code/survey_cleaning/correct_5751.R")
ha_data <- correct_5751(ha_data)
# FF-4 (aka Porto Alegre 1-ha, 5753)
source("./code/survey_cleaning/correct_5753.R")
ha_data <- correct_5753(ha_data)
# 10-ha fragments
# FF-5 (aka Dimona 10-ha 2206)
source("./code/survey_cleaning/correct_2206.R")
ha_data <- correct_2206(ha_data)
# FF-6 (aka Colosso 10-ha 5752)
source("./code/survey_cleaning/correct_5752.R")
ha_data <- correct_5752(ha_data)
# FF-7 (aka Porto Alegre 10-ha, 5754)
source("./code/survey_cleaning/correct_5754.R")
ha_data <- correct_5754(ha_data)
# STEP 4: data validation ----------------------------------------
# validation of tag numbers
source("./code/survey_cleaning/check_tags.R")
ha_data <- check_tags(ha_data)
# Find and label plants found without tags ------------------------
# Sometimes the survey team simply misses established plants ("adults" that are
# in a plot, perhaps because density is so high or it is difficult to see
# individuals in treefalls. Sometimes plants that were marked will also lose
# their tags - a branch could fall on them and knock it off, or it could
# be lost in a treefall, or someone walking though the plot might have kicked
# it. All of these plants are given a new tag, with a notation recorded in the
# survey data sheet that they were an "adult" `plant without tag`, a
# `new plant in plot`, or `ULY` (i.e., 'unmarked last year').
# This function:
# (1) identifies all plants flagged as 'established plants found without a tag
# in a survey',
# (2) creates a csv file of these plants for follow-up review,
# (3) identifies & creates a csv file of "ULY" plants flagged in 1999, and
# (4) removes the flag from all plants marked this way in 1999.
# Why 1999? Plots were still being completely surveyed through 99, which is why
# so many ULY were found in that year.
# (5) Finally, this function creates a new column to
# indicate if plants were `found_without_tag` using the logical TRUE/FALSE
#
# source("./code/survey_cleaning/check_no_tag.R")
# ha_data <- check_no_tag(ha_data)
# creating/validating `status` columns
# `Infl` column is conditional: given that a plant is reproductive,
# how many infloresences does it have? This function scans to
# make sure the entries are properly coded.
# this function codes a plant as a new seedling in a given survey year (TRUE/FALSE)
# This codes the options in the branchfall / treefall column
# sometimes the survey team took recorded notes about the physiological
# condition of plants (e.g. resprouting, dried up). this function creates a
# column indicating the condition of plants.
# add column indicating if plant was found or missing in survey yr --------
# add census_status (measured/missing)
# This adds a column indicating the status of plants in each year's census
# (alive, dead, missing,NA...). It's a little more complicated than it sounds;
# see function for details. The NA plants are duplicate tag numbers, most of
# which have no measurments of plant size
source("./code/survey_cleaning/check_condition.R")
ha_data <- check_condition(ha_data)
# function checks the column of each plant in a survey year (recorded, dead, missing)
source("./code/survey_cleaning/check_census_status.R")
ha_data <- check_census_status(ha_data)
# final tweak ------------------------------------------------------------
ha_data <- ha_data %>%
arrange(row, as.numeric(column)) %>%
mutate(subplot = paste(row, column, sep = ""))
# STEP 5: Save the file -------------------------------------------------
if (!dir.exists("./data/survey_clean")) {
dir.create("./data/survey_clean")
} else {
# print(" ")
# Nothing - this is a placeholder
}
write_csv(ha_data, "./data/survey_clean/heliconia_survey_clean.csv")
x <- "\n
------------------------------------------------------------------
The raw Heliconia census data have been merged, cleaned, and
organized. You may now proceed to prepare the dataset for
uploading to Dryad.
A csv file of this data is saved in the following folder:
`data/survey_clean/heliconia_survey_clean.csv`.
------------------------------------------------------------------
\n"
writeLines(x)
no_status<-ha_data %>% filter(is.na(census_status))
return(ha_data)
}