From 14d4edfacc1a7bf36f851ec595d0d1f728dd9b07 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 3 Oct 2024 21:17:35 +0000 Subject: [PATCH 01/74] Push for raw upload --- .../spatial/spatial-traffic.R | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R new file mode 100644 index 000000000..d74b02a6e --- /dev/null +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R @@ -0,0 +1,84 @@ +# Load necessary libraries +if (!requireNamespace("httr", quietly = TRUE)) install.packages("httr") +if (!requireNamespace("sf", quietly = TRUE)) install.packages("sf") +if (!requireNamespace("tools", quietly = TRUE)) install.packages("tools") +if (!requireNamespace("aws.s3", quietly = TRUE)) install.packages("aws.s3") +if (!requireNamespace("arrow", quietly = TRUE)) install.packages("arrow") + +library(aws.s3) +library(dplyr) +library(purrr) +library(sf) +library(arrow) + +# Define S3 bucket and paths +AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") +output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment") +current_year <- strftime(Sys.Date(), "%Y") + +# Function to process each year and upload shapefiles for that specific year to S3 +process_shapefiles_for_year <- function(year) { + # Define the URL for the shapefile ZIP file, dynamically for each year + url <- paste0("https://apps1.dot.illinois.gov/gist2/gisdata/all", year, ".zip") + + # Create a temporary file to store the downloaded ZIP + temp_zip <- tempfile(fileext = ".zip") + temp_dir <- tempdir() + + # Use httr to download the ZIP file to a temporary location + response <- httr::GET(url) + + # Check if the request was successful + if (httr::status_code(response) == 200) { + # Save the content of the response as a ZIP file in a temporary location + writeBin(httr::content(response, "raw"), temp_zip) + message(paste("Shapefile ZIP for year", year, "downloaded successfully.")) + + # Unzip the file into a temporary directory + utils::unzip(temp_zip, exdir = temp_dir) + message(paste("Shapefile for year", year, "unzipped into temporary directory.")) + + # List files in the unzipped directory and look for the .shp files + unzipped_files <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) + shp_file_for_year <- unzipped_files[grepl(paste0("T2HWY", year), unzipped_files, ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] + + # Process only the shapefile that matches the current year + if (length(shp_file_for_year) == 1) { + # Read the shapefile into the environment using sf::st_read + shapefile_data <- sf::st_read(shp_file_for_year) + + # Create a temporary file to save the shapefile as GeoParquet for S3 upload + temp_parquet <- tempfile(fileext = ".parquet") + + # Save the shapefile as a GeoParquet file + sf::write_parquet(shapefile_data, temp_parquet) + + # Define remote file path in S3 + remote_file_path <- file.path(output_bucket, paste0("T2HWY_", year, ".parquet")) + + # Upload to S3 if it doesn't already exist + if (!aws.s3::object_exists(remote_file_path)) { + message(paste("Uploading T2HWY_", year, "to S3 as Parquet...")) + put_object(file = temp_parquet, object = remote_file_path, bucket = AWS_S3_RAW_BUCKET) + + message(paste("Shapefile T2HWY", year, "uploaded to S3 at:", remote_file_path)) + } else { + message(paste("Shapefile T2HWY", year, "already exists in S3, skipping 
upload.")) + } + + # Clean up temporary files + file.remove(temp_parquet) + + } else { + message(paste("No shapefile found for year", year, ".")) + } + + } else { + message(paste("Failed to retrieve the file for year", year, ". Status code: ", httr::status_code(response))) + } +} + +# Loop through the years from 2012 to the current year and process each shapefile +for (year in 2012:current_year) { + process_shapefiles_for_year(year) +} From 2ecffe16a7059030eb549070cd5a3bfb7d823561 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 3 Oct 2024 21:25:48 +0000 Subject: [PATCH 02/74] Remove unnecessary code --- .../spatial/spatial-traffic.R | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R index d74b02a6e..f69f3f596 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R @@ -1,10 +1,3 @@ -# Load necessary libraries -if (!requireNamespace("httr", quietly = TRUE)) install.packages("httr") -if (!requireNamespace("sf", quietly = TRUE)) install.packages("sf") -if (!requireNamespace("tools", quietly = TRUE)) install.packages("tools") -if (!requireNamespace("aws.s3", quietly = TRUE)) install.packages("aws.s3") -if (!requireNamespace("arrow", quietly = TRUE)) install.packages("arrow") - library(aws.s3) library(dplyr) library(purrr) @@ -51,7 +44,7 @@ process_shapefiles_for_year <- function(year) { temp_parquet <- tempfile(fileext = ".parquet") # Save the shapefile as a GeoParquet file - sf::write_parquet(shapefile_data, temp_parquet) + sf::st_write_parquet(shapefile_data, temp_parquet) # Define remote file path in S3 remote_file_path <- file.path(output_bucket, paste0("T2HWY_", year, ".parquet")) From e0c05aec962037f7b2a676158a0e10f2b2265d42 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 3 Oct 2024 21:26:05 +0000 Subject: [PATCH 03/74] Testing file --- traffic.R | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 traffic.R diff --git a/traffic.R b/traffic.R new file mode 100644 index 000000000..cfa3ad30b --- /dev/null +++ b/traffic.R @@ -0,0 +1,106 @@ +library(sf) +library(DBI) +library(noctua) +library(dplyr) +library(leaflet) +library(ggplot2) + +shapefile <- read_sf("etl/scripts-ccao-data-raw-us-east-1/spatial/traffic_data.shp") + + +filtered_data <- shapefile %>% + filter(COUNTY_NAM == "COOK") %>% + st_as_sf() %>% + st_transform(crs = 4326) %>% + mutate(AADT_STRIN = as.numeric(AADT_STRIN)) + +ggplot() + + geom_sf(data = filtered_data, aes(color = AADT_STRIN), size = 1) + # Adjust size as needed + scale_color_viridis_c(option = "plasma", name = "AADT_STRIN") + # Use viridis color scale for better visualization + labs( + title = "Map of AADT_STRIN in Cook County", + subtitle = "Visualizing Traffic Data on Cook County Roads", + x = "Longitude", + y = "Latitude" + ) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5), + plot.subtitle = element_text(hjust = 0.5) + ) + +filtered_data <- st_zm(filtered_data) + + +pal <- colorNumeric(palette = "viridis", domain = filtered_data$AADT_STRIN) + +leaflet(filtered_data) %>% + addProviderTiles("CartoDB.Positron") %>% # Add a base map layer + addPolylines( + color = ~pal(AADT_STRIN), # Use the color palette based on AADT_STRIN + weight = 2, # Adjust line thickness + opacity = 0.7, # Adjust line transparency + popup = ~paste("AADT_STRIN:", 
AADT_STRIN) # Add popups to show AADT_STRIN values + ) %>% + addLegend( + pal = pal, + values = ~AADT_STRIN, + opacity = 0.7, + title = "AADT_STRIN", + position = "bottomright" + ) %>% + setView(lng = mean(st_coordinates(filtered_data)[, 1]), + lat = mean(st_coordinates(filtered_data)[, 2]), + zoom = 10) # Adjust zoom level and map center + +con <- dbConnect(noctua::athena()) + +# Assuming the original CRS is EPSG:3857 (Web Mercator), adjust this if necessary +secondary_roads <- dbGetQuery(con, 'SELECT * FROM "spatial"."secondary_road" WHERE CAST(year AS INTEGER) = 2023') %>% + st_as_sf() %>% + st_set_crs(3435) + + +# Leaflet map combining both filtered_data and secondary_roads + # Leaflet map combining both filtered_data and secondary_roads + leaflet() %>% + addProviderTiles("CartoDB.Positron") %>% # Base map layer + addPolylines( + data = filtered_data, + color = ~pal(AADT_STRIN), # Color based on AADT_STRIN + weight = 2, # Adjust line thickness + opacity = 0.7, # Adjust transparency + popup = ~paste("AADT_STRIN:", AADT_STRIN) # Popup for AADT_STRIN + ) %>% + addPolylines( + data = secondary_roads, + color = "blue", # Color for secondary roads + weight = 1, # Adjust line thickness for secondary roads + opacity = 0.6, # Adjust transparency for secondary roads + popup = ~paste("Road Name:", name) # Add popups for secondary roads + ) %>% + addLegend( + pal = pal, + values = filtered_data$AADT_STRIN, + opacity = 0.7, + title = "AADT_STRIN", + position = "bottomright" + ) %>% + setView( + lng = mean(st_coordinates(filtered_data)[, 1]), + lat = mean(st_coordinates(filtered_data)[, 2]), + zoom = 10 # Adjust zoom level + ) + + + # Step 2: Buffer the geometries by 50 feet (around filtered_data) + filtered_data_buffer <- st_buffer(filtered_data, dist = 50) + + # Step 3: Spatial join to find intersections within 50 feet + joined_data <- st_join(secondary_roads, filtered_data_buffer, join = st_intersects) + + # Step 4: Optionally, filter for rows where intersections occurred + joined_data_within_50ft <- joined_data %>% + filter(!is.na(AADT_STRIN)) # AADT_STRIN is from filtered_data, so this filters where the join occurred + + From d145bae6f98b9d87005e8995959bab548b99a7a5 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 3 Oct 2024 21:57:35 +0000 Subject: [PATCH 04/74] Modify HWY so it looks back to 2012 --- etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R index f69f3f596..ef5d9cbcf 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R @@ -33,7 +33,7 @@ process_shapefiles_for_year <- function(year) { # List files in the unzipped directory and look for the .shp files unzipped_files <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) - shp_file_for_year <- unzipped_files[grepl(paste0("T2HWY", year), unzipped_files, ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] + shp_file_for_year <- unzipped_files[grepl(paste0("HWY", year), unzipped_files, ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] # Process only the shapefile that matches the current year if (length(shp_file_for_year) == 1) { @@ -44,7 +44,7 @@ process_shapefiles_for_year <- function(year) { temp_parquet <- tempfile(fileext = ".parquet") # Save the shapefile as a GeoParquet file - sf::st_write_parquet(shapefile_data, 
temp_parquet) + geoarrow::write_geoparquet(shapefile_data, temp_parquet) # Define remote file path in S3 remote_file_path <- file.path(output_bucket, paste0("T2HWY_", year, ".parquet")) From b749f3ad027b16411e7830427eb5807adebb641f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 4 Oct 2024 15:49:52 +0000 Subject: [PATCH 05/74] Minor simplifications --- etl/renv.lock | 36 ++++++++- .../spatial/spatial-traffic.R | 80 ++++++++----------- 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/etl/renv.lock b/etl/renv.lock index 35498d897..80382383c 100644 --- a/etl/renv.lock +++ b/etl/renv.lock @@ -142,6 +142,20 @@ ], "Hash": "ae4a925e0f6bb1b7e5fa96b739c5221a" }, + "RSocrata": { + "Package": "RSocrata", + "Version": "1.7.15-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "httr", + "jsonlite", + "mime", + "plyr" + ], + "Hash": "435ebea3fa736ab1317c79a5fa34fa55" + }, "Rcpp": { "Package": "Rcpp", "Version": "1.0.12", @@ -1926,8 +1940,13 @@ "noctua": { "Package": "noctua", "Version": "2.6.2", - "Source": "Repository", - "Repository": "CRAN", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteUsername": "DyfanJones", + "RemoteRepo": "noctua", + "RemoteRef": "master", + "RemoteSha": "23a4cfbf537407c7a1547fc13ba771ba2eb098e0", "Requirements": [ "DBI", "R", @@ -1938,7 +1957,7 @@ "utils", "uuid" ], - "Hash": "c03d73125d695e80b35b4bb3eacf0358" + "Hash": "a48e1decdd027c44ea6b97b0fe0950cb" }, "numDeriv": { "Package": "numDeriv", @@ -2276,6 +2295,17 @@ "Repository": "CRAN", "Hash": "09eb987710984fc2905c7129c7d85e65" }, + "plyr": { + "Package": "plyr", + "Version": "1.8.9", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp" + ], + "Hash": "6b8177fd19982f0020743fadbfdbd933" + }, "png": { "Package": "png", "Version": "0.1-8", diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R index ef5d9cbcf..a0bbf3a8a 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R @@ -1,77 +1,67 @@ library(aws.s3) library(dplyr) +library(httr) +library(lubridate) library(purrr) library(sf) library(arrow) # Define S3 bucket and paths AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") -output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment") +output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment", "traffic") current_year <- strftime(Sys.Date(), "%Y") +# Get list of available files +years <- map(2012:year(Sys.Date()), \(x){ + if (HEAD(paste0( + "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip" + ))$status_code == 200) { + x + } +}) %>% + unlist() + # Function to process each year and upload shapefiles for that specific year to S3 -process_shapefiles_for_year <- function(year) { - # Define the URL for the shapefile ZIP file, dynamically for each year - url <- paste0("https://apps1.dot.illinois.gov/gist2/gisdata/all", year, ".zip") +process_shapefiles_for_year <- map(years, \(x) { + + remote_file_path <- file.path(output_bucket, paste0(x, ".parquet")) - # Create a temporary file to store the downloaded ZIP - temp_zip <- tempfile(fileext = ".zip") - temp_dir <- tempdir() + # Skip everything if file already exists + if (!object_exists(remote_file_path)) { + # Define the URL for the shapefile ZIP file, dynamically for each year + url <- paste0("https://apps1.dot.illinois.gov/gist2/gisdata/all", x, 
".zip") - # Use httr to download the ZIP file to a temporary location - response <- httr::GET(url) + # Create a temporary file to store the downloaded ZIP + temp_zip <- tempfile(fileext = ".zip") + temp_dir <- tempdir() - # Check if the request was successful - if (httr::status_code(response) == 200) { - # Save the content of the response as a ZIP file in a temporary location - writeBin(httr::content(response, "raw"), temp_zip) - message(paste("Shapefile ZIP for year", year, "downloaded successfully.")) + # Download the ZIP file to a temporary location + download.file(url = url, destfile = temp_zip) + + message(paste("Shapefile ZIP for year", x, "downloaded successfully.")) # Unzip the file into a temporary directory - utils::unzip(temp_zip, exdir = temp_dir) - message(paste("Shapefile for year", year, "unzipped into temporary directory.")) + unzip(temp_zip, exdir = temp_dir) + message(paste("Shapefile for year", x, "unzipped into temporary directory.")) # List files in the unzipped directory and look for the .shp files unzipped_files <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) - shp_file_for_year <- unzipped_files[grepl(paste0("HWY", year), unzipped_files, ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] + shp_file_for_year <- unzipped_files[grepl(paste0("HWY", x), unzipped_files, ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] # Process only the shapefile that matches the current year if (length(shp_file_for_year) == 1) { # Read the shapefile into the environment using sf::st_read shapefile_data <- sf::st_read(shp_file_for_year) - # Create a temporary file to save the shapefile as GeoParquet for S3 upload - temp_parquet <- tempfile(fileext = ".parquet") - # Save the shapefile as a GeoParquet file - geoarrow::write_geoparquet(shapefile_data, temp_parquet) - - # Define remote file path in S3 - remote_file_path <- file.path(output_bucket, paste0("T2HWY_", year, ".parquet")) - - # Upload to S3 if it doesn't already exist - if (!aws.s3::object_exists(remote_file_path)) { - message(paste("Uploading T2HWY_", year, "to S3 as Parquet...")) - put_object(file = temp_parquet, object = remote_file_path, bucket = AWS_S3_RAW_BUCKET) - - message(paste("Shapefile T2HWY", year, "uploaded to S3 at:", remote_file_path)) - } else { - message(paste("Shapefile T2HWY", year, "already exists in S3, skipping upload.")) - } - - # Clean up temporary files - file.remove(temp_parquet) + geoarrow::write_geoparquet(shapefile_data, remote_file_path) } else { - message(paste("No shapefile found for year", year, ".")) + message(paste("No shapefile found for year", x, ".")) } - } else { - message(paste("Failed to retrieve the file for year", year, ". 
Status code: ", httr::status_code(response)))
   }
-}
 
-# Loop through the years from 2012 to the current year and process each shapefile
-for (year in 2012:current_year) {
-  process_shapefiles_for_year(year)
-}
+})
+
+unlink(temp_dir, recursive = TRUE)

From 2bfc81397c3dfbb70007f12b3f2b62de86b829bf Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Fri, 4 Oct 2024 18:09:27 +0000
Subject: [PATCH 06/74] Add cleaning script

---
 ...raffic.R => spatial-environment-traffic.R} |  0
 .../spatial/spatial-environment-traffic.R     | 52 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 rename etl/scripts-ccao-data-raw-us-east-1/spatial/{spatial-traffic.R => spatial-environment-traffic.R} (100%)
 create mode 100644 etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R

diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
similarity index 100%
rename from etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-traffic.R
rename to etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
new file mode 100644
index 000000000..cce13e99c
--- /dev/null
+++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
@@ -0,0 +1,52 @@
+# Load required libraries
+library(aws.s3)
+library(dplyr)
+library(purrr)
+library(sf)
+
+# Define S3 bucket and paths
+AWS_S3_RAW_BUCKET <- "ccao-data-raw-us-east-1"
+AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
+raw_bucket_prefix <- "spatial/environment/traffic/"
+warehouse_bucket_path <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic")
+
+# List files from the raw bucket
+raw_files <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = raw_bucket_prefix)
+
+
+process_files_from_raw_bucket <- map(raw_files$Key, \(file_key) {
+
+  # Skip if the file is not a .parquet file
+  if (!grepl("\\.parquet$", file_key)) {
+    message(paste("Skipping non-parquet file:", file_key))
+    return(NULL)
+  }
+
+  # Download the file locally for inspection
+  local_parquet_file <- tempfile(fileext = ".parquet")
+
+  # Corrected: Pass only the bucket name and file key
+  save_object(file = local_parquet_file, object = file_key, bucket = AWS_S3_RAW_BUCKET)
+
+  # Read the parquet file using geoarrow
+  shapefile_data <- geoarrow::read_geoparquet(local_parquet_file)
+
+  # Define the columns you want to select. These change over time, so a strict select isn't great.
+  # But all columns are present from 2014 on.
+  required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR",
+                        "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY")
+
+  # Select only the columns that exist in the dataset
+  existing_columns <- intersect(required_columns, colnames(shapefile_data))
+  selected_columns <- shapefile_data %>%
+    select(all_of(existing_columns))
+
+  # Show the first few rows of the selected columns for inspection
+  print(paste("File:", file_key))
+  print(head(selected_columns))
+
+  # Clean up the temporary local file
+  unlink(local_parquet_file)
+
+})
+
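The cleaning script added above guards its select() with intersect() because the IDOT layer's schema drifts across vintages (per the comment, the full column list is only guaranteed from 2014 on). One way to see that drift before hard-coding a column list is to read just the schema of each year's file and reduce to the columns shared by every vintage. The sketch below is illustrative only: it assumes a hypothetical local mirror of the per-year GeoParquet files in a traffic/ directory rather than the S3 bucket.

library(arrow)
library(purrr)

# Hypothetical local copies of the raw files, e.g. traffic/2012.parquet
files <- list.files("traffic", pattern = "\\.parquet$", full.names = TRUE)

# open_dataset() only reads metadata, so this is cheap even for large files
cols_by_year <- map(set_names(files, basename(files)), function(f) {
  names(open_dataset(f)$schema)
})

# Columns present in every vintage are safe to select strictly; anything
# else needs the intersect()-style guard used in the script above
common_cols <- reduce(cols_by_year, intersect)
print(common_cols)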
From da0791d2b83eaa7c5954c4452d8686dcbe195ff5 Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Fri, 4 Oct 2024 18:15:14 +0000
Subject: [PATCH 07/74] Quick edit

---
 .../spatial/spatial-environment-traffic.R | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
index cce13e99c..0f8f9fe9e 100644
--- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
@@ -7,21 +7,14 @@ library(sf)
 # Define S3 bucket and paths
 AWS_S3_RAW_BUCKET <- "ccao-data-raw-us-east-1"
 AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
-raw_bucket_prefix <- "spatial/environment/traffic/"
 warehouse_bucket_path <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic")
 
 # List files from the raw bucket
-raw_files <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = raw_bucket_prefix)
+raw_files <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = warehouse_bucket_path)
 
 
 process_files_from_raw_bucket <- map(raw_files$Key, \(file_key) {
 
-  # Skip if the file is not a .parquet file
-  if (!grepl("\\.parquet$", file_key)) {
-    message(paste("Skipping non-parquet file:", file_key))
-    return(NULL)
-  }
-
   # Download the file locally for inspection
   local_parquet_file <- tempfile(fileext = ".parquet")

From b15eafa24b8876f5c7484575cb314fa514c63ee8 Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Fri, 4 Oct 2024 18:21:25 +0000
Subject: [PATCH 08/74] Push correct version

---
 .../spatial/spatial-environment-traffic.R | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
index 0f8f9fe9e..c46ad48bd 100644
--- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
@@ -3,16 +3,20 @@ library(aws.s3)
 library(dplyr)
 library(purrr)
 library(sf)
+library(geoarrow)
 
-# Define S3 bucket and paths
-AWS_S3_RAW_BUCKET <- "ccao-data-raw-us-east-1"
+# Define S3 bucket and paths for raw and warehouse
+AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
 AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
+
+# Paths to raw and warehouse buckets
+raw_bucket_path <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment", "traffic")
 warehouse_bucket_path <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic")
 
 # List files from the raw bucket
-raw_files <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = warehouse_bucket_path)
+raw_files <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = "spatial/environment/traffic/")
 
+# 
Process each file from the raw bucket process_files_from_raw_bucket <- map(raw_files$Key, \(file_key) { # Download the file locally for inspection @@ -34,12 +38,11 @@ process_files_from_raw_bucket <- map(raw_files$Key, \(file_key) { selected_columns <- shapefile_data %>% select(all_of(existing_columns)) - # Show the first few rows of the selected columns for inspection - print(paste("File:", file_key)) - print(head(selected_columns)) - # Clean up the temporary local file unlink(local_parquet_file) -}) + # Optionally, write processed data back to warehouse bucket + output_file <- file.path(warehouse_bucket_path, file_key) + geoarrow::write_geoparquet(selected_columns, output_file) +}) From bc1070c4054428be3be9c5e6cf595634664941e9 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 7 Oct 2024 17:09:20 +0000 Subject: [PATCH 09/74] Remove old file --- traffic.R | 106 ------------------------------------------------------ 1 file changed, 106 deletions(-) delete mode 100644 traffic.R diff --git a/traffic.R b/traffic.R deleted file mode 100644 index cfa3ad30b..000000000 --- a/traffic.R +++ /dev/null @@ -1,106 +0,0 @@ -library(sf) -library(DBI) -library(noctua) -library(dplyr) -library(leaflet) -library(ggplot2) - -shapefile <- read_sf("etl/scripts-ccao-data-raw-us-east-1/spatial/traffic_data.shp") - - -filtered_data <- shapefile %>% - filter(COUNTY_NAM == "COOK") %>% - st_as_sf() %>% - st_transform(crs = 4326) %>% - mutate(AADT_STRIN = as.numeric(AADT_STRIN)) - -ggplot() + - geom_sf(data = filtered_data, aes(color = AADT_STRIN), size = 1) + # Adjust size as needed - scale_color_viridis_c(option = "plasma", name = "AADT_STRIN") + # Use viridis color scale for better visualization - labs( - title = "Map of AADT_STRIN in Cook County", - subtitle = "Visualizing Traffic Data on Cook County Roads", - x = "Longitude", - y = "Latitude" - ) + - theme_minimal() + - theme( - plot.title = element_text(hjust = 0.5), - plot.subtitle = element_text(hjust = 0.5) - ) - -filtered_data <- st_zm(filtered_data) - - -pal <- colorNumeric(palette = "viridis", domain = filtered_data$AADT_STRIN) - -leaflet(filtered_data) %>% - addProviderTiles("CartoDB.Positron") %>% # Add a base map layer - addPolylines( - color = ~pal(AADT_STRIN), # Use the color palette based on AADT_STRIN - weight = 2, # Adjust line thickness - opacity = 0.7, # Adjust line transparency - popup = ~paste("AADT_STRIN:", AADT_STRIN) # Add popups to show AADT_STRIN values - ) %>% - addLegend( - pal = pal, - values = ~AADT_STRIN, - opacity = 0.7, - title = "AADT_STRIN", - position = "bottomright" - ) %>% - setView(lng = mean(st_coordinates(filtered_data)[, 1]), - lat = mean(st_coordinates(filtered_data)[, 2]), - zoom = 10) # Adjust zoom level and map center - -con <- dbConnect(noctua::athena()) - -# Assuming the original CRS is EPSG:3857 (Web Mercator), adjust this if necessary -secondary_roads <- dbGetQuery(con, 'SELECT * FROM "spatial"."secondary_road" WHERE CAST(year AS INTEGER) = 2023') %>% - st_as_sf() %>% - st_set_crs(3435) - - -# Leaflet map combining both filtered_data and secondary_roads - # Leaflet map combining both filtered_data and secondary_roads - leaflet() %>% - addProviderTiles("CartoDB.Positron") %>% # Base map layer - addPolylines( - data = filtered_data, - color = ~pal(AADT_STRIN), # Color based on AADT_STRIN - weight = 2, # Adjust line thickness - opacity = 0.7, # Adjust transparency - popup = ~paste("AADT_STRIN:", AADT_STRIN) # Popup for AADT_STRIN - ) %>% - addPolylines( - data = secondary_roads, - color = "blue", # Color for 
secondary roads - weight = 1, # Adjust line thickness for secondary roads - opacity = 0.6, # Adjust transparency for secondary roads - popup = ~paste("Road Name:", name) # Add popups for secondary roads - ) %>% - addLegend( - pal = pal, - values = filtered_data$AADT_STRIN, - opacity = 0.7, - title = "AADT_STRIN", - position = "bottomright" - ) %>% - setView( - lng = mean(st_coordinates(filtered_data)[, 1]), - lat = mean(st_coordinates(filtered_data)[, 2]), - zoom = 10 # Adjust zoom level - ) - - - # Step 2: Buffer the geometries by 50 feet (around filtered_data) - filtered_data_buffer <- st_buffer(filtered_data, dist = 50) - - # Step 3: Spatial join to find intersections within 50 feet - joined_data <- st_join(secondary_roads, filtered_data_buffer, join = st_intersects) - - # Step 4: Optionally, filter for rows where intersections occurred - joined_data_within_50ft <- joined_data %>% - filter(!is.na(AADT_STRIN)) # AADT_STRIN is from filtered_data, so this filters where the join occurred - - From 917c48d118244058a425a3848a67a494d54e2826 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 7 Oct 2024 19:42:40 +0000 Subject: [PATCH 10/74] Working script --- .../spatial/spatial-environment-traffic.R | 58 +++++-------------- 1 file changed, 15 insertions(+), 43 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index c46ad48bd..dcc3f18f4 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -1,48 +1,20 @@ -# Load required libraries library(aws.s3) library(dplyr) -library(purrr) library(sf) library(geoarrow) -# Define S3 bucket and paths for raw and warehouse -AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") -AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") - -# Paths to raw and warehouse buckets -raw_bucket_path <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment", "traffic") -warehouse_bucket_path <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic") - -# List files from the raw bucket -raw_files <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = "spatial/environment/traffic/") - -# Process each file from the raw bucket -process_files_from_raw_bucket <- map(raw_files$Key, \(file_key) { - - # Download the file locally for inspection - local_parquet_file <- tempfile(fileext = ".parquet") - - # Corrected: Pass only the bucket name and file key - save_object(file = local_parquet_file, object = file_key, bucket = AWS_S3_RAW_BUCKET) - - # Read the parquet file using geoarrow - shapefile_data <- geoarrow::read_geoparquet(local_parquet_file) - - # Define the columns you want to select. These change over time, so a strict select isn't great. - # But all columns are present from 2014 on. 
- required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", - "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY") - - # Select only the columns that exist in the dataset - existing_columns <- intersect(required_columns, colnames(shapefile_data)) - selected_columns <- shapefile_data %>% - select(all_of(existing_columns)) - - # Clean up the temporary local file - unlink(local_parquet_file) - - # Optionally, write processed data back to warehouse bucket - output_file <- file.path(warehouse_bucket_path, file_key) - geoarrow::write_geoparquet(selected_columns, output_file) - -}) +# Define the S3 bucket and file path +AWS_S3_RAW_BUCKET <- "ccao-data-raw-us-east-1" +file_key <- "spatial/environment/traffic/2023.parquet" + +# Pipeline: download, read, and process the data with lowercase column names +shapefile_data <- tempfile(fileext = ".parquet") %>% + {save_object(object = file_key, bucket = AWS_S3_RAW_BUCKET, file = .); .} %>% + geoarrow::read_geoparquet() %>% + mutate(geometry = st_as_sfc(geometry)) %>% + st_as_sf() %>% + st_transform(4326) %>% + mutate(geometry_3435 = st_transform(geometry, 3435)) %>% + select(all_of(intersect(c("lns", "surf_typ", "surf_wth", "srf_yr", "aadt", "crs_with", "crs_opp", "crs_yr", + "road_name", "dtress_wth", "dtress_opp", "sp_lim", "inventory", "geometry_3435"), + tolower(colnames(.))))) From 936b972803b0aa09498f14578d846dd6e2a14ed7 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 7 Oct 2024 20:36:36 +0000 Subject: [PATCH 11/74] Working loop --- .../spatial/spatial-environment-traffic.R | 60 ++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index dcc3f18f4..8c8ce0a65 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -3,18 +3,50 @@ library(dplyr) library(sf) library(geoarrow) -# Define the S3 bucket and file path +# Define the S3 bucket and folder path AWS_S3_RAW_BUCKET <- "ccao-data-raw-us-east-1" -file_key <- "spatial/environment/traffic/2023.parquet" - -# Pipeline: download, read, and process the data with lowercase column names -shapefile_data <- tempfile(fileext = ".parquet") %>% - {save_object(object = file_key, bucket = AWS_S3_RAW_BUCKET, file = .); .} %>% - geoarrow::read_geoparquet() %>% - mutate(geometry = st_as_sfc(geometry)) %>% - st_as_sf() %>% - st_transform(4326) %>% - mutate(geometry_3435 = st_transform(geometry, 3435)) %>% - select(all_of(intersect(c("lns", "surf_typ", "surf_wth", "srf_yr", "aadt", "crs_with", "crs_opp", "crs_yr", - "road_name", "dtress_wth", "dtress_opp", "sp_lim", "inventory", "geometry_3435"), - tolower(colnames(.))))) +AWS_S3_WAREHOUSE_BUCKET <- "ccao-data-warehouse-us-east-1" +s3_folder <- "spatial/environment/traffic/" +output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic") + +# List all the files in the S3 folder +files_in_s3 <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) + +# Filter for files that match a 'parquet' pattern +parquet_files <- files_in_s3 %>% + filter(grepl("\\.parquet$", Key)) %>% + pull(Key) + +# Loop through each parquet file and process it +for (file_key in parquet_files) { + message("Processing file: ", file_key) + + # Download the file from S3 as a raw connection into 
a temporary file + temp_file <- tempfile(fileext = ".parquet") + save_object(object = file_key, bucket = AWS_S3_RAW_BUCKET, file = temp_file) + + # Read the downloaded file using geoarrow into the R environment + shapefile_data <- geoarrow::read_geoparquet(temp_file) + + # Ensure geometry column is in 'sf' format + shapefile_data$geometry <- st_as_sfc(shapefile_data$geometry) + + shapefile_data <- shapefile_data %>% + st_as_sf() %>% + st_transform(4326) %>% + mutate(geometry_3435 = st_transform(geometry, 3435)) + + # Define the columns you want to select + required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", + "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY", "geometry_3435") + + # Select only the non-geometry columns that exist in the dataset + existing_columns <- intersect(required_columns, colnames(shapefile_data)) + selected_columns <- shapefile_data %>% + select(all_of(existing_columns)) + + # Clean up the temporary file + unlink(temp_file) +} + +message("Processing completed for all files.") From 1bbcce48f9b961e832ebbd90858a8438283d90db Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 7 Oct 2024 20:53:41 +0000 Subject: [PATCH 12/74] Billy edits --- .../spatial/spatial-environment-traffic.R | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 8c8ce0a65..80b55a264 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -12,21 +12,18 @@ output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "t # List all the files in the S3 folder files_in_s3 <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) -# Filter for files that match a 'parquet' pattern +# Get the 'Key' (file path) for all files parquet_files <- files_in_s3 %>% - filter(grepl("\\.parquet$", Key)) %>% pull(Key) # Loop through each parquet file and process it for (file_key in parquet_files) { - message("Processing file: ", file_key) - # Download the file from S3 as a raw connection into a temporary file - temp_file <- tempfile(fileext = ".parquet") - save_object(object = file_key, bucket = AWS_S3_RAW_BUCKET, file = temp_file) + # Read the parquet file directly from S3 using aws.s3 functions + obj <- get_object(object = file_key, bucket = AWS_S3_RAW_BUCKET) - # Read the downloaded file using geoarrow into the R environment - shapefile_data <- geoarrow::read_geoparquet(temp_file) + # Convert the S3 object into raw data and read using geoarrow + shapefile_data <- geoarrow::read_geoparquet(rawConnection(obj)) # Ensure geometry column is in 'sf' format shapefile_data$geometry <- st_as_sfc(shapefile_data$geometry) @@ -45,8 +42,18 @@ for (file_key in parquet_files) { selected_columns <- shapefile_data %>% select(all_of(existing_columns)) - # Clean up the temporary file - unlink(temp_file) -} + # Create a temporary file for saving the processed data + output_file <- tempfile(fileext = ".parquet") + + # Write the selected columns to a new parquet file + geoarrow::write_geoparquet(selected_columns, output_file) + + # Define the output file path in the S3 bucket + output_key <- file.path(output_bucket, basename(file_key)) -message("Processing completed for all files.") + # Upload the processed file to the S3 output 
bucket
+  put_object(file = output_file, object = output_key, bucket = AWS_S3_WAREHOUSE_BUCKET)
+
+  # Clean up the temporary files
+  unlink(output_file)
+}
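The loop body above repeats a write-to-tempfile, put_object(), unlink() round trip for every file. That round trip can be factored into a small helper. The function below is a sketch, not part of the repo; it assumes only geoarrow::write_geoparquet() and aws.s3::put_object(), both already used in this patch.

library(aws.s3)

# Write an sf object to S3 as GeoParquet via a temporary local file.
# on.exit() removes the tempfile even if the upload throws an error.
put_geoparquet <- function(data, object, bucket) {
  tmp <- tempfile(fileext = ".parquet")
  on.exit(unlink(tmp), add = TRUE)
  geoarrow::write_geoparquet(data, tmp)
  put_object(file = tmp, object = object, bucket = bucket)
}

# Usage, mirroring the loop body above:
# put_geoparquet(selected_columns, output_key, AWS_S3_WAREHOUSE_BUCKET)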
From 1e4475199712bee29b1ebef0d08053fb662bfff0 Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Mon, 7 Oct 2024 20:54:46 +0000
Subject: [PATCH 13/74] text edits

---
 .../spatial/spatial-environment-traffic.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
index 80b55a264..728c8e746 100644
--- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
@@ -9,10 +9,9 @@ AWS_S3_WAREHOUSE_BUCKET <- "ccao-data-warehouse-us-east-1"
 s3_folder <- "spatial/environment/traffic/"
 output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic")
 
-# List all the files in the S3 folder
 files_in_s3 <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder)
 
-# Get the 'Key' (file path) for all files
+# Get the 'Key'
 parquet_files <- files_in_s3 %>%
   pull(Key)
 
@@ -25,7 +24,7 @@ for (file_key in parquet_files) {
   # Convert the S3 object into raw data and read using geoarrow
   shapefile_data <- geoarrow::read_geoparquet(rawConnection(obj))
 
-  # Ensure geometry column is in 'sf' format
+  # Convert geometry column to 'sf' format
   shapefile_data$geometry <- st_as_sfc(shapefile_data$geometry)
 
   shapefile_data <- shapefile_data %>%
@@ -33,7 +32,8 @@ for (file_key in parquet_files) {
     st_transform(4326) %>%
     mutate(geometry_3435 = st_transform(geometry, 3435))
 
-  # Define the columns you want to select
+  # Define the columns you want to select. We do this because some columns are not present in older
+  # versions of the data
   required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR",

From 21aa4382520ff844c8f24efe3bb7b3c6272134e0 Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Mon, 7 Oct 2024 20:56:49 +0000
Subject: [PATCH 14/74] Use correct buckets

---
 .../spatial/spatial-environment-traffic.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
index 728c8e746..7e5fc9fff 100644
--- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
@@ -4,8 +4,8 @@ library(sf)
 library(geoarrow)
 
 # Define the S3 bucket and folder path
-AWS_S3_RAW_BUCKET <- "ccao-data-raw-us-east-1"
-AWS_S3_WAREHOUSE_BUCKET <- "ccao-data-warehouse-us-east-1"
+AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
+AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
 s3_folder <- "spatial/environment/traffic/"
 output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic")

From 83a0e4e4038b78a7f17efd5bde1e44f682ab995d Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Mon, 7 Oct 2024 20:57:59 +0000
Subject: [PATCH 15/74] Text edits

---
 .../spatial/spatial-environment-traffic.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
index 7e5fc9fff..86cae8342 100644
--- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R
@@ -32,8 +32,7 @@ for (file_key in parquet_files) {
     st_transform(4326) %>%
     mutate(geometry_3435 = st_transform(geometry, 3435))
 
-  # Define the columns you want to select. 
We do this because some columns are not present in older - # versions of the data + # We do this because some columns are not present in older versions of the data required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY", "geometry_3435") From 32065710bc023332235a595d902340f9698dce53 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 15:01:03 +0000 Subject: [PATCH 16/74] lintr --- .../spatial/spatial-environment-traffic.R | 16 +++++++++----- .../spatial/spatial-environment-traffic.R | 21 ++++++++++++------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index a0bbf3a8a..204e44d33 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -8,20 +8,23 @@ library(arrow) # Define S3 bucket and paths AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") -output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment", "traffic") +output_bucket <- file.path(AWS_S3_RAW_BUCKET, + "spatial", "environment", "traffic") current_year <- strftime(Sys.Date(), "%Y") # Get list of available files years <- map(2012:year(Sys.Date()), \(x){ if (HEAD(paste0( - "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip" + "https://apps1.dot.illinois.gov/gist2/gisdata/all", + x, ".zip" ))$status_code == 200) { x } }) %>% unlist() -# Function to process each year and upload shapefiles for that specific year to S3 +# Function to process each year and upload shapefiles for +# that specific year to S3 process_shapefiles_for_year <- map(years, \(x) { remote_file_path <- file.path(output_bucket, paste0(x, ".parquet")) @@ -29,7 +32,8 @@ process_shapefiles_for_year <- map(years, \(x) { # Skip everything if file already exists if (!object_exists(remote_file_path)) { # Define the URL for the shapefile ZIP file, dynamically for each year - url <- paste0("https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip") + url <- paste0( + "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip") # Create a temporary file to store the downloaded ZIP temp_zip <- tempfile(fileext = ".zip") @@ -46,7 +50,9 @@ process_shapefiles_for_year <- map(years, \(x) { # List files in the unzipped directory and look for the .shp files unzipped_files <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) - shp_file_for_year <- unzipped_files[grepl(paste0("HWY", x), unzipped_files, ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] + shp_file_for_year <- unzipped_files[grepl(paste0("HWY", x), + unzipped_files, ignore.case = TRUE) + & grepl("\\.shp$", unzipped_files)] # Process only the shapefile that matches the current year if (length(shp_file_for_year) == 1) { diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 86cae8342..ccf0ef236 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -7,9 +7,11 @@ library(geoarrow) AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic/" 
-output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic") +output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, + "spatial", "environment", "traffic") -files_in_s3 <- get_bucket_df(bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) +files_in_s3 <- get_bucket_df( + bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) # Get the 'Key' parquet_files <- files_in_s3 %>% @@ -32,12 +34,16 @@ for (file_key in parquet_files) { st_transform(4326) %>% mutate(geometry_3435 = st_transform(geometry, 3435)) - # We do this because some columns are not present in older versions of the data - required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", - "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY", "geometry_3435") + # We do this because some columns are not present in + # older versions of the data + required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", + "CRS_WITH", "CRS_OPP", "CRS_YR", + "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY", "geometry_3435") # Select only the non-geometry columns that exist in the dataset - existing_columns <- intersect(required_columns, colnames(shapefile_data)) + existing_columns <- intersect(required_columns, + colnames(shapefile_data)) selected_columns <- shapefile_data %>% select(all_of(existing_columns)) @@ -51,7 +57,8 @@ for (file_key in parquet_files) { output_key <- file.path(output_bucket, basename(file_key)) # Upload the processed file to the S3 output bucket - put_object(file = output_file, object = output_key, bucket = AWS_S3_WAREHOUSE_BUCKET) + put_object(file = output_file, object = output_key, + bucket = AWS_S3_WAREHOUSE_BUCKET) # Clean up the temporary files unlink(output_file) From 59c180de3e0d82f01917245411f2d0220ed0394c Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 15:04:56 +0000 Subject: [PATCH 17/74] lintr --- .../spatial/spatial-environment-traffic.R | 13 ++++++++----- .../spatial/spatial-environment-traffic.R | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 204e44d33..bd5a9328e 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -13,7 +13,7 @@ output_bucket <- file.path(AWS_S3_RAW_BUCKET, current_year <- strftime(Sys.Date(), "%Y") # Get list of available files -years <- map(2012:year(Sys.Date()), \(x){ +years <- map(2012:year(Sys.Date()), \(x) { if (HEAD(paste0( "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip" @@ -33,7 +33,7 @@ process_shapefiles_for_year <- map(years, \(x) { if (!object_exists(remote_file_path)) { # Define the URL for the shapefile ZIP file, dynamically for each year url <- paste0( - "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip") + "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip") # Create a temporary file to store the downloaded ZIP temp_zip <- tempfile(fileext = ".zip") @@ -46,12 +46,15 @@ process_shapefiles_for_year <- map(years, \(x) { # Unzip the file into a temporary directory unzip(temp_zip, exdir = temp_dir) - message(paste("Shapefile for year", x, "unzipped into temporary directory.")) + message(paste("Shapefile for year", x, + "unzipped into temporary directory.")) # List files in the unzipped directory and look for the .shp files unzipped_files <- 
list.files(temp_dir, recursive = TRUE, full.names = TRUE) - shp_file_for_year <- unzipped_files[grepl(paste0("HWY", x), - unzipped_files, ignore.case = TRUE) + shp_file_for_year <- unzipped_files[grepl(paste0("HWY", + x), + unzipped_files, + ignore.case = TRUE) & grepl("\\.shp$", unzipped_files)] # Process only the shapefile that matches the current year diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index ccf0ef236..8081f6ccb 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -11,7 +11,7 @@ output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic") files_in_s3 <- get_bucket_df( - bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) + bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) # Get the 'Key' parquet_files <- files_in_s3 %>% From de905ef91d4ea5d12d47e8474aa668e8384e82eb Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 8 Oct 2024 15:48:08 +0000 Subject: [PATCH 18/74] Change geoparquet function --- etl/renv.lock | 4 +- .../spatial/spatial-environment-traffic.R | 76 ++++++++----------- 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/etl/renv.lock b/etl/renv.lock index 80382383c..2bbbc4f11 100644 --- a/etl/renv.lock +++ b/etl/renv.lock @@ -1943,7 +1943,7 @@ "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", - "RemoteUsername": "DyfanJones", + "RemoteUsername": "dyfanjones", "RemoteRepo": "noctua", "RemoteRef": "master", "RemoteSha": "23a4cfbf537407c7a1547fc13ba771ba2eb098e0", @@ -1957,7 +1957,7 @@ "utils", "uuid" ], - "Hash": "a48e1decdd027c44ea6b97b0fe0950cb" + "Hash": "b3fc482d0ae2f51ed324fd3da66471b4" }, "numDeriv": { "Package": "numDeriv", diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 8081f6ccb..f1b09c4db 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -1,65 +1,53 @@ library(aws.s3) library(dplyr) +library(purrr) library(sf) library(geoarrow) # Define the S3 bucket and folder path AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") -s3_folder <- "spatial/environment/traffic/" -output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, - "spatial", "environment", "traffic") - -files_in_s3 <- get_bucket_df( - bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder) +output_bucket <- file.path( + AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic" + ) # Get the 'Key' -parquet_files <- files_in_s3 %>% +parquet_files <- get_bucket_df( + bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder + ) %>% pull(Key) # Loop through each parquet file and process it -for (file_key in parquet_files) { - - # Read the parquet file directly from S3 using aws.s3 functions - obj <- get_object(object = file_key, bucket = AWS_S3_RAW_BUCKET) - - # Convert the S3 object into raw data and read using geoarrow - shapefile_data <- geoarrow::read_geoparquet(rawConnection(obj)) - - # Convert geometry column to 'sf' format - shapefile_data$geometry <- st_as_sfc(shapefile_data$geometry) +walk(parquet_files, \(file_key) { - shapefile_data <- shapefile_data %>% - st_as_sf() %>% 
- st_transform(4326) %>% - mutate(geometry_3435 = st_transform(geometry, 3435)) + if (!aws.s3::object_exists(file.path(AWS_S3_WAREHOUSE_BUCKET, file_key))) { - # We do this because some columns are not present in - # older versions of the data - required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", - "CRS_WITH", "CRS_OPP", "CRS_YR", - "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY", "geometry_3435") + print(paste("Cleaning", file_key)) - # Select only the non-geometry columns that exist in the dataset - existing_columns <- intersect(required_columns, - colnames(shapefile_data)) - selected_columns <- shapefile_data %>% - select(all_of(existing_columns)) + # Convert the S3 object into raw data and read using geoarrow + shapefile_data <- geoarrow::read_geoparquet_sf( + file.path(AWS_S3_RAW_BUCKET, file_key) + ) %>% + st_transform(4326) %>% + mutate(geometry_3435 = st_transform(geometry, 3435)) - # Create a temporary file for saving the processed data - output_file <- tempfile(fileext = ".parquet") + # We do this because some columns are not present in + # older versions of the data + required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", + "CRS_WITH", "CRS_OPP", "CRS_YR", + "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY", "geometry_3435") - # Write the selected columns to a new parquet file - geoarrow::write_geoparquet(selected_columns, output_file) + # Select only the non-geometry columns that exist in the dataset + existing_columns <- intersect(required_columns, colnames(shapefile_data)) + shapefile_data %>% + select(all_of(existing_columns)) %>% + geoarrow::write_geoparquet( + file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) + ) - # Define the output file path in the S3 bucket - output_key <- file.path(output_bucket, basename(file_key)) + print(paste(file_key, "cleaned and uploaded.")) - # Upload the processed file to the S3 output bucket - put_object(file = output_file, object = output_key, - bucket = AWS_S3_WAREHOUSE_BUCKET) + } - # Clean up the temporary files - unlink(output_file) -} +}) From 418ef5e970464470110e38f93922483961dbf2a7 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 16:22:24 +0000 Subject: [PATCH 19/74] Add filter for Cook County --- .../spatial/spatial-environment-traffic.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index bd5a9328e..94c26051e 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -60,7 +60,8 @@ process_shapefiles_for_year <- map(years, \(x) { # Process only the shapefile that matches the current year if (length(shp_file_for_year) == 1) { # Read the shapefile into the environment using sf::st_read - shapefile_data <- sf::st_read(shp_file_for_year) + shapefile_data <- sf::st_read(shp_file_for_year) %>% + filter(INV_CO == '016') # Save the shapefile as a GeoParquet file geoarrow::write_geoparquet(shapefile_data, remote_file_path) From 45b7929a07a46686b507bba5f2348f98abbaed89 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 16:23:07 +0000 Subject: [PATCH 20/74] Comment --- .../spatial/spatial-environment-traffic.R | 1 + 1 file changed, 1 insertion(+) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R 
b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
index 94c26051e..1e1483467 100644
--- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
@@ -61,6 +61,7 @@ process_shapefiles_for_year <- map(years, \(x) {
   if (length(shp_file_for_year) == 1) {
     # Read the shapefile into the environment using sf::st_read
     shapefile_data <- sf::st_read(shp_file_for_year) %>%
+      # Add filter for Cook County
       filter(INV_CO == '016')
 
     # Save the shapefile as a GeoParquet file

From c6961d08baecce5226492ba38c8b65212a97d5f5 Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Tue, 8 Oct 2024 16:39:44 +0000
Subject: [PATCH 21/74] Add if-else statement for County

---
 .../spatial/spatial-environment-traffic.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
index 1e1483467..13202a921 100644
--- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
+++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R
@@ -61,8 +61,9 @@ process_shapefiles_for_year <- map(years, \(x) {
   if (length(shp_file_for_year) == 1) {
     # Read the shapefile into the environment using sf::st_read
     shapefile_data <- sf::st_read(shp_file_for_year) %>%
-      # Add filter for Cook County
-      filter(INV_CO == '016')
+      # Add filter for Cook County. The name changes in different years
+      filter(if ("COUNTY" %in% names(df))
+        COUNTY == '016' else INV_CO == '016')
 
     # Save the shapefile as a GeoParquet file
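This commit and the three that follow iterate on the same filter, because the county-code column is named COUNTY in some vintages and INV_CO in others, and because names(df) here refers to a df object that is never defined in the script. Below is a sketch of an alternative that sidesteps the placeholder question by renaming first; it assumes at most one of the two columns exists in any given vintage, and the normalized county_code name is illustrative rather than from the repo.

library(dplyr)

# Normalize whichever county column a vintage has to one name, then filter.
# rename(any_of(...)) silently skips entries whose old name is absent.
filter_cook <- function(shp) {
  shp %>%
    rename(any_of(c(county_code = "COUNTY", county_code = "INV_CO"))) %>%
    filter(county_code == "016")
}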
The name changes in different years - filter(if ("COUNTY" %in% names(df)) + filter(if ("COUNTY" %in% names(shapefile_data)) COUNTY == '016' else INV_CO == '016') # Save the shapefile as a GeoParquet file From 266ba86dfd0cb3f8262e4ae5f282d59e628dfded Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 16:44:23 +0000 Subject: [PATCH 23/74] Include period instead of named file --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index a7fdf4d3f..9b4a98631 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -62,7 +62,7 @@ process_shapefiles_for_year <- map(years, \(x) { # Read the shapefile into the environment using sf::st_read shapefile_data <- sf::st_read(shp_file_for_year) %>% # Add filter for Cook County. The name changes in different years - filter(if ("COUNTY" %in% names(shapefile_data)) + filter(if ("COUNTY" %in% names(.)) COUNTY == '016' else INV_CO == '016') # Save the shapefile as a GeoParquet file From cc5a18f05d2b0d9cffed97ee3a0fd1876262237f Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 16:45:14 +0000 Subject: [PATCH 24/74] period --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 9b4a98631..9a308f74b 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -61,7 +61,7 @@ process_shapefiles_for_year <- map(years, \(x) { if (length(shp_file_for_year) == 1) { # Read the shapefile into the environment using sf::st_read shapefile_data <- sf::st_read(shp_file_for_year) %>% - # Add filter for Cook County. The name changes in different years + # Add filter for Cook County. The name changes in different years. 
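
A note on the pattern patches 21-24 converge on: the conditional inside `filter()` works, but the same idea can be pulled into a small helper so the raw script stays readable. A minimal sketch assuming only the column names and county code visible in the diffs above; the helper name is hypothetical, not something in the repo:

library(dplyr)

# Keep only Cook County rows, whichever county column this vintage uses
filter_cook_county <- function(df, cook_code = "016") {
  county_col <- intersect(c("COUNTY", "INV_CO"), names(df))
  stopifnot(length(county_col) >= 1)
  df %>% filter(.data[[county_col[[1]]]] == cook_code)
}

# Usage inside the pipeline:
# shapefile_data <- sf::st_read(shp_file_for_year) %>% filter_cook_county()
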
filter(if ("COUNTY" %in% names(.)) COUNTY == '016' else INV_CO == '016') From 2cb1e0c35d7607f0715f52060216a273cf69c44f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 8 Oct 2024 18:21:32 +0000 Subject: [PATCH 25/74] Fix s3 pathing --- .../spatial/spatial-environment-traffic.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index f1b09c4db..f5e57db6c 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -7,9 +7,8 @@ library(geoarrow) # Define the S3 bucket and folder path AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") -output_bucket <- file.path( - AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment", "traffic" - ) +s3_folder <- "spatial/environment/traffic/" +output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) # Get the 'Key' parquet_files <- get_bucket_df( From 683a5519432d007823ae53234e5176430d88a341 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 18:35:54 +0000 Subject: [PATCH 26/74] Start dbt schema --- ...ximity.dist_pin_to_traffic_speed_limit.sql | 24 +++++++++++++++++++ dbt/models/spatial/docs.md | 8 +++++++ dbt/models/spatial/schema.yml | 3 +++ 3 files changed, 35 insertions(+) create mode 100644 dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql diff --git a/dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql b/dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql new file mode 100644 index 000000000..f71306592 --- /dev/null +++ b/dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql @@ -0,0 +1,24 @@ +-- CTAS to create a table of distance to the nearest rail tracks for each PIN +{{ + config( + materialized='table', + partitioned_by=['year'], + bucketed_by=['pin10'], + bucket_count=1 + ) +}} + +SELECT + pcl.pin10, + ARBITRARY(xy.name_id) AS nearest_road_name, + ARBITRARY(xy.dist_ft) AS nearest_speed_limit_dist_ft, + ARBITRARY(xy.year) AS nearest_speed_limit_data_year, + pcl.year +FROM {{ source('spatial', 'parcel') }} AS pcl +INNER JOIN + ( {{ dist_to_nearest_geometry(source('spatial', 'traffic')) }} ) AS xy + ON pcl.x_3435 = xy.x_3435 + AND pcl.y_3435 = xy.y_3435 + AND pcl.year = xy.pin_year + AND xy.sp_lim > 0 +GROUP BY pcl.pin10, pcl.year diff --git a/dbt/models/spatial/docs.md b/dbt/models/spatial/docs.md index 096221bb8..2a1463741 100644 --- a/dbt/models/spatial/docs.md +++ b/dbt/models/spatial/docs.md @@ -493,6 +493,14 @@ Includes townships within the City of Chicago, which are technically defunct. 
**Geometry:** `MULTIPOLYGON` {% enddocs %} +# traffic + +{% docs table_traffic %} +Locations of roads derived from the Illinois Department of Transportation website + +**Geometry:** `MULTILINESTRING` +{% enddocs %} + # transit_dict {% docs table_transit_dict %} diff --git a/dbt/models/spatial/schema.yml b/dbt/models/spatial/schema.yml index 0982b8106..513900073 100644 --- a/dbt/models/spatial/schema.yml +++ b/dbt/models/spatial/schema.yml @@ -174,6 +174,9 @@ sources: - name: township description: '{{ doc("table_township") }}' + - name: traffic + description: '{{ doc("table_traffic") }}' + - name: transit_dict description: '{{ doc("table_transit_dict") }}' From 76bfe5bc09b942100a85c133427135268ac6055a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 19:33:03 +0000 Subject: [PATCH 27/74] Add year --- .../spatial/spatial-environment-traffic.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 9a308f74b..67a5fff62 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -63,7 +63,8 @@ process_shapefiles_for_year <- map(years, \(x) { shapefile_data <- sf::st_read(shp_file_for_year) %>% # Add filter for Cook County. The name changes in different years. filter(if ("COUNTY" %in% names(.)) - COUNTY == '016' else INV_CO == '016') + COUNTY == '016' else INV_CO == '016') %>% + mutate(year = x) # Save the shapefile as a GeoParquet file geoarrow::write_geoparquet(shapefile_data, remote_file_path) From c1d57e8d7ec5eab99b306259d9823ee6807982fa Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 19:49:55 +0000 Subject: [PATCH 28/74] Fix path --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index f5e57db6c..67962f2df 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -19,7 +19,7 @@ parquet_files <- get_bucket_df( # Loop through each parquet file and process it walk(parquet_files, \(file_key) { - if (!aws.s3::object_exists(file.path(AWS_S3_WAREHOUSE_BUCKET, file_key))) { + if (!aws.s3::object_exists(file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder, file_key))) { print(paste("Cleaning", file_key)) From 89c69e72f0aca8aed2d772b86b7ced2fcf63492c Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 8 Oct 2024 20:30:33 +0000 Subject: [PATCH 29/74] Add renaming --- .../spatial/spatial-environment-traffic.R | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 67962f2df..8b55b5ddc 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -32,15 +32,34 @@ walk(parquet_files, \(file_key) { # We do this because some columns are not present in # older versions of the data - required_columns <- c("LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", +
required_columns <- c("FCNAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY", "geometry_3435") + "SP_LIM", "INVENTORY", "geometry_3435", "year") # Select only the non-geometry columns that exist in the dataset existing_columns <- intersect(required_columns, colnames(shapefile_data)) - shapefile_data %>% + shapefile_data_test <- shapefile_data %>% select(all_of(existing_columns)) %>% + mutate( + road_type = if ("FCNAME" %in% colnames(.)) FCNAME else NA, + lanes = if ("LNS" %in% colnames(.)) LNS else NA, + surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, + surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, + surface_year = if ("SRF_YR" %in% colnames(.)) SRF_YR else NA, + annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, + condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, + condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, + condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, + road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, + distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, + distress_opposing = if ("DTRESS_OPP" %in% colnames(.)) DTRESS_OPP else NA, + speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, + inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA + ) %>% + select(-one_of(c("FCNAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", + "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY"))) %>% geoarrow::write_geoparquet( file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) ) From cb883e4071f3b7548acefcb09921b667eae1b83a Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 9 Oct 2024 15:21:51 +0000 Subject: [PATCH 30/74] Remove unnecessary code --- .../spatial/spatial-environment-traffic.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 67a5fff62..6b7b82178 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -76,5 +76,3 @@ process_shapefiles_for_year <- map(years, \(x) { } }) - -unlink(temp_dir, recursive = TRUE) From 2014cbeda72a975519756c89373e61ed2e3b433f Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 9 Oct 2024 15:38:22 +0000 Subject: [PATCH 31/74] Delete file --- ...ximity.dist_pin_to_traffic_speed_limit.sql | 24 ------------------- 1 file changed, 24 deletions(-) delete mode 100644 dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql diff --git a/dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql b/dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql deleted file mode 100644 index f71306592..000000000 --- a/dbt/models/proximity/proximity.dist_pin_to_traffic_speed_limit.sql +++ /dev/null @@ -1,24 +0,0 @@ --- CTAS to create a table of distance to the nearest rail tracks for each PIN -{{ - config( - materialized='table', - partitioned_by=['year'], - bucketed_by=['pin10'], - bucket_count=1 - ) -}} - -SELECT - pcl.pin10, - ARBITRARY(xy.name_id) AS nearest_road_name, - ARBITRARY(xy.dist_ft) AS nearest_speed_limit_dist_ft, - ARBITRARY(xy.year) AS nearest_speed_limit_data_year, - pcl.year -FROM {{ source('spatial', 'parcel') }} AS pcl -INNER JOIN - ( {{
dist_to_nearest_geometry(source('spatial', 'traffic')) }} ) AS xy - ON pcl.x_3435 = xy.x_3435 - AND pcl.y_3435 = xy.y_3435 - AND pcl.year = xy.pin_year - AND xy.sp_lim > 0 -GROUP BY pcl.pin10, pcl.year From 102b721b1a86e83de76cf493273769aae920da88 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 9 Oct 2024 15:39:30 +0000 Subject: [PATCH 32/74] Remove docs --- dbt/models/spatial/docs.md | 8 -------- dbt/models/spatial/schema.yml | 3 --- 2 files changed, 11 deletions(-) diff --git a/dbt/models/spatial/docs.md b/dbt/models/spatial/docs.md index 2a1463741..096221bb8 100644 --- a/dbt/models/spatial/docs.md +++ b/dbt/models/spatial/docs.md @@ -493,14 +493,6 @@ Includes townships within the City of Chicago, which are technically defunct. **Geometry:** `MULTIPOLYGON` {% enddocs %} -# traffic - -{% docs table_traffic %} -Locations of roads derived from the Illinois Department of Transportation website - -**Geometry:** `MULTILINESTRING` -{% enddocs %} - # transit_dict {% docs table_transit_dict %} diff --git a/dbt/models/spatial/schema.yml b/dbt/models/spatial/schema.yml index 513900073..0982b8106 100644 --- a/dbt/models/spatial/schema.yml +++ b/dbt/models/spatial/schema.yml @@ -174,9 +174,6 @@ sources: - name: township description: '{{ doc("table_township") }}' - - name: traffic - description: '{{ doc("table_traffic") }}' - - name: transit_dict description: '{{ doc("table_transit_dict") }}' From ad6ed4626ea73990669e950b5311bb738c060e3a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 9 Oct 2024 16:10:11 +0000 Subject: [PATCH 33/74] Use FC_NAME and FCNAME --- .../spatial/spatial-environment-traffic.R | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 8b55b5ddc..7dd12bcd4 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -30,42 +30,51 @@ walk(parquet_files, \(file_key) { st_transform(4326) %>% mutate(geometry_3435 = st_transform(geometry, 3435)) - # We do this because some columns are not present in - # older versions of the data - required_columns <- c("FCNAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", - "CRS_WITH", "CRS_OPP", "CRS_YR", - "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY", "geometry_3435", "year") - - # Select only the non-geometry columns that exist in the dataset - existing_columns <- intersect(required_columns, colnames(shapefile_data)) - shapefile_data_test <- shapefile_data %>% - select(all_of(existing_columns)) %>% - mutate( - road_type = if ("FCNAME" %in% colnames(.)) FCNAME else NA, - lanes = if ("LNS" %in% colnames(.)) LNS else NA, - surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, - surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, - surface_year = if ("SRF_YR" %in% colnames(.)) SRF_YR else NA, - annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, - condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, - condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, - condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, - road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, - distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, - distress_opposing = if ("DTRESS_OPP" %in% colnames(.)) DTRESS_OPP else NA, -
speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, - inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA + # Convert the S3 object into raw data and read using geoarrow + shapefile_data <- geoarrow::read_geoparquet_sf( + file.path(AWS_S3_RAW_BUCKET, file_key) ) %>% - select(-one_of(c("FCNAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", - "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY"))) %>% - geoarrow::write_geoparquet( - file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) - ) + st_transform(4326) %>% + mutate(geometry_3435 = st_transform(geometry, 3435)) + + + # We do this because some columns are not present in + # older versions of the data + required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", + "CRS_WITH", "CRS_OPP", "CRS_YR", + "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY", "geometry_3435", "year") + + # Select only the non-geometry columns that exist in the dataset + existing_columns <- intersect(required_columns, colnames(shapefile_data)) + shapefile_data %>% + select(all_of(existing_columns)) %>% + mutate( + road_type = if ("FCNAME" %in% colnames(.)) FCNAME else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, + lanes = if ("LNS" %in% colnames(.)) LNS else NA, + surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, + surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, + surface_year = if ("SRF_YR" %in% colnames(.)) SRF_YR else NA, + annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, + condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, + condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, + condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, + road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, + distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, + distress_opposing = if ("DTRESS_OPP" %in% colnames(.)) DTRESS_OPP else NA, + speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, + inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA + ) %>% + select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", + "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY"))) %>% + geoarrow::write_geoparquet( + file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) + ) print(paste(file_key, "cleaned and uploaded.")) + } } }) From 47b04698ba30e58edfe506204d031332b3445b88 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 9 Oct 2024 16:11:02 +0000 Subject: [PATCH 34/74] Fix brackets --- .../spatial/spatial-environment-traffic.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 7dd12bcd4..6e3ee51b0 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -76,5 +76,4 @@ walk(parquet_files, \(file_key) { } } - -}) +) From 20b9c4c9d602ad7ad4580fca9845035951f94d54 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 9 Oct 2024 18:11:57 +0000 Subject: [PATCH 35/74] Get back to running --- .../spatial/spatial-environment-traffic.R | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git 
a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 6e3ee51b0..4e7b042d3 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -13,13 +13,13 @@ output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) # Get the 'Key' parquet_files <- get_bucket_df( bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder - ) %>% +) %>% pull(Key) # Loop through each parquet file and process it walk(parquet_files, \(file_key) { - if (!aws.s3::object_exists(file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder, file_key))) { + if (!aws.s3::object_exists(file.path(AWS_S3_WAREHOUSE_BUCKET, file_key))) { print(paste("Cleaning", file_key)) @@ -30,50 +30,50 @@ walk(parquet_files, \(file_key) { st_transform(4326) %>% mutate(geometry_3435 = st_transform(geometry, 3435)) - # Convert the S3 object into raw data and read using geoarrow - shapefile_data <- geoarrow::read_geoparquet_sf( - file.path(AWS_S3_RAW_BUCKET, file_key) - ) %>% - st_transform(4326) %>% - mutate(geometry_3435 = st_transform(geometry, 3435)) + # Convert the S3 object into raw data and read using geoarrow + # shapefile_data <- geoarrow::read_geoparquet_sf( + # file.path(AWS_S3_RAW_BUCKET, file_key) + # ) %>% + # st_transform(4326) %>% + # mutate(geometry_3435 = st_transform(geometry, 3435)) - # We do this because some columns are not present in - # older versions of the data - required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", - "CRS_WITH", "CRS_OPP", "CRS_YR", - "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY", "geometry_3435", "year") + # We do this because some columns are not present in + # older versions of the data + required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", + "CRS_WITH", "CRS_OPP", "CRS_YR", + "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY", "geometry_3435", "year") - # Select only the non-geometry columns that exist in the dataset - existing_columns <- intersect(required_columns, colnames(shapefile_data)) - shapefile_data %>% - select(all_of(existing_columns)) %>% - mutate( - road_type = if ("FCNAME" %in% colnames(.)) FCNAME else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, - lanes = if ("LNS" %in% colnames(.)) LNS else NA, - surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, - surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, - surface_year = if ("SRF_YR" %in% colnames(.)) SRF_YR else NA, - annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, - condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, - condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, - condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, - road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, - distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, - distress_opposing = if ("DTRESS_OPP" %in% colnames(.)) DTRESS_OPP else NA, - speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, - inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA - ) %>% - select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", - "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY"))) %>% - geoarrow::write_geoparquet( - 
file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) - ) + # Select only the non-geometry columns that exist in the dataset + existing_columns <- intersect(required_columns, colnames(shapefile_data)) + shapefile_data %>% + select(all_of(existing_columns)) %>% + mutate( + road_type = if ("FCNAME" %in% colnames(.)) FCNAME else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, + lanes = if ("LNS" %in% colnames(.)) LNS else NA, + surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, + surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, + surface_year = if ("SRF_YR" %in% colnames(.)) SRF_YR else NA, + annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, + condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, + condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, + condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, + road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, + distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, + distress_opposing = if ("DTRESS_OPP" %in% colnames(.)) DTRESS_OPP else NA, + speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, + inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA + ) %>% + select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", + "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY"))) %>% + geoarrow::write_geoparquet( + file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) + ) print(paste(file_key, "cleaned and uploaded.")) - } } +} ) From 369e228f9606401c43ee6f46ee11b4ab54c3c40c Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 9 Oct 2024 18:43:24 +0000 Subject: [PATCH 36/74] Make year a character --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 6b7b82178..0891a53ba 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -64,7 +64,7 @@ process_shapefiles_for_year <- map(years, \(x) { # Add filter for Cook County. The name changes in different years. 
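
Patch 36 just below switches `year` from the loop integer to character. A likely motivation: the yearly GeoParquet files are later read side by side, and parquet schemas must agree, so a file written with an integer `year` clashes with one written with a string `year`. A toy illustration of the failure mode, using throwaway file names:

library(arrow)

write_parquet(data.frame(year = 2012L), "t_2012.parquet")
write_parquet(data.frame(year = "2013"), "t_2013.parquet")

# Treating the two files as one dataset trips over int32 vs. string;
# writing year as character in every file avoids the mismatch
open_dataset(c("t_2012.parquet", "t_2013.parquet"), unify_schemas = TRUE)
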
filter(if ("COUNTY" %in% names(.)) COUNTY == '016' else INV_CO == '016') %>% - mutate(year = x) + mutate(year = as.character(x)) # Save the shapefile as a GeoParquet file geoarrow::write_geoparquet(shapefile_data, remote_file_path) From f07d62f08d3189c7adfe0a5565d9a91f617e0977 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 18:08:20 +0000 Subject: [PATCH 37/74] Add NA handeling --- .../spatial/spatial-environment-traffic.R | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 4e7b042d3..984700209 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -10,6 +10,55 @@ AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic/" output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) +# Recoding of road data +road_codes <- c( + "762" = "Reinforced over PCC - Reinforcement unknown", + "765" = "Non-Reinforced over PCC - No reinforcement", + "767" = "Reinforced over PCC - No reinforcement", + "770" = "Non-Reinforced over PCC - Partial reinforcement", + "772" = "Reinforced over PCC - Partial reinforcement", + "775" = "Non-Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", + "777" = "Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", + "780" = "Non-Reinforced over PCC - Full reinforcement", + "782" = "Reinforced over PCC - Full reinforcement", + "790" = "Non-Reinforced over PCC - Continuous reinforcement", + "792" = "Reinforced over PCC - Continuous reinforcement", + "600" = "Over PCC - Reinforcement unknown", + "610" = "Over PCC - No reinforcement", + "615" = "Over PCC - No reinforcement but having short panels and dowels", + "620" = "Over PCC - Partial reinforcement", + "625" = "Over PCC - With No or Partial Reinforcement - But having Hinged Joints", + "630" = "Over PCC - Full reinforcement", + "640" = "Over PCC - Continuous reinforcement", + "650" = "Over Brick, Block, Steel, or similar material", + "700" = "Reinforcement unknown", + "710" = "No reinforcement", + "720" = "Partial reinforcement", + "725" = "With No or Partial reinforcement but having Hinged Joints", + "730" = "Full reinforcement", + "740" = "Continuous reinforcement", + "760" = "Non-Reinforced over PCC - Reinforcement unknown", + "400" = "Mixed Bituminous (low type bituminous)", + "410" = "Bituminous Penetration (low type bituminous)", + "500" = "Bituminous Surface Treated – Mixed bituminous", + "501" = "Over PCC - Rubblized - Reinforcement unknown", + "510" = "Over PCC - Rubblized - No reinforcement", + "520" = "Over PCC - Rubblized - Partial reinforcement", + "525" = "Over PCC - Rubblized - With No or Partial Reinforcement - But having Hinged Joints", + "530" = "Over PCC - Rubblized - Full reinforcement", + "540" = "Over PCC - Rubblized - Continuous reinforcement", + "550" = "Bituminous Concrete (other than Class I)", + "560" = "Bituminous Concrete Pavement (Full-Depth)", + "100" = "Without dust palliative treatment", + "110" = "With dust palliative (oiled)", + "200" = "Without dust palliative treatment", + "210" = "With dust palliative treatment", + "300" = "Bituminous Surface-Treated (low type bituminous)", + "010" = "Unimproved", + "020" = "Graded and Drained" +) + + # Get the 'Key' 
parquet_files <- get_bucket_df( bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder @@ -65,9 +114,14 @@ walk(parquet_files, \(file_key) { speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA ) %>% + # Recode surface_type based on road codes + mutate(surface_type = road_codes[as.character(surface_type)]) %>% + # Select and remove unnecessary columns select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY"))) %>% + # Replace all 0 values with NA, excluding the geometry column + mutate(across(-geometry, ~replace(., . %in% c(0, "0000"), NA))) %>% geoarrow::write_geoparquet( file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) ) From 6ec7267101746366661cc141503fe43bb9b5d035 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 18:14:11 +0000 Subject: [PATCH 38/74] Reorder columns --- .../spatial/spatial-environment-traffic.R | 59 ++++++++++--------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 984700209..10ae58dfc 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -12,17 +12,24 @@ output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) # Recoding of road data road_codes <- c( - "762" = "Reinforced over PCC - Reinforcement unknown", - "765" = "Non-Reinforced over PCC - No reinforcement", - "767" = "Reinforced over PCC - No reinforcement", - "770" = "Non-Reinforced over PCC - Partial reinforcement", - "772" = "Reinforced over PCC - Partial reinforcement", - "775" = "Non-Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", - "777" = "Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", - "780" = "Non-Reinforced over PCC - Full reinforcement", - "782" = "Reinforced over PCC - Full reinforcement", - "790" = "Non-Reinforced over PCC - Continuous reinforcement", - "792" = "Reinforced over PCC - Continuous reinforcement", + "010" = "Unimproved", + "020" = "Graded and Drained", + "100" = "Without dust palliative treatment", + "110" = "With dust palliative (oiled)", + "200" = "Without dust palliative treatment", + "210" = "With dust palliative treatment", + "300" = "Bituminous Surface-Treated (low type bituminous)", + "400" = "Mixed Bituminous (low type bituminous)", + "410" = "Bituminous Penetration (low type bituminous)", + "500" = "Bituminous Surface Treated – Mixed bituminous", + "501" = "Over PCC - Rubblized - Reinforcement unknown", + "510" = "Over PCC - Rubblized - No reinforcement", + "520" = "Over PCC - Rubblized - Partial reinforcement", + "525" = "Over PCC - Rubblized - With No or Partial Reinforcement - But having Hinged Joints", + "530" = "Over PCC - Rubblized - Full reinforcement", + "540" = "Over PCC - Rubblized - Continuous reinforcement", + "550" = "Bituminous Concrete (other than Class I)", + "560" = "Bituminous Concrete Pavement (Full-Depth)", "600" = "Over PCC - Reinforcement unknown", "610" = "Over PCC - No reinforcement", "615" = "Over PCC - No reinforcement but having short panels and dowels", @@ -38,24 +45,18 @@ road_codes <- c( "730" = "Full reinforcement", "740" = "Continuous reinforcement", "760" = 
"Non-Reinforced over PCC - Reinforcement unknown", - "400" = "Mixed Bituminous (low type bituminous)", - "410" = "Bituminous Penetration (low type bituminous)", - "500" = "Bituminous Surface Treated – Mixed bituminous", - "501" = "Over PCC - Rubblized - Reinforcement unknown", - "510" = "Over PCC - Rubblized - No reinforcement", - "520" = "Over PCC - Rubblized - Partial reinforcement", - "525" = "Over PCC - Rubblized - With No or Partial Reinforcement - But having Hinged Joints", - "530" = "Over PCC - Rubblized - Full reinforcement", - "540" = "Over PCC - Rubblized - Continuous reinforcement", - "550" = "Bituminous Concrete (other than Class I)", - "560" = "Bituminous Concrete Pavement (Full-Depth)", - "100" = "Without dust palliative treatment", - "110" = "With dust palliative (oiled)", - "200" = "Without dust palliative treatment", - "210" = "With dust palliative treatment", - "300" = "Bituminous Surface-Treated (low type bituminous)", - "010" = "Unimproved", - "020" = "Graded and Drained" + "762" = "Reinforced over PCC - Reinforcement unknown", + "765" = "Non-Reinforced over PCC - No reinforcement", + "767" = "Reinforced over PCC - No reinforcement", + "770" = "Non-Reinforced over PCC - Partial reinforcement", + "772" = "Reinforced over PCC - Partial reinforcement", + "775" = "Non-Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", + "777" = "Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", + "780" = "Non-Reinforced over PCC - Full reinforcement", + "782" = "Reinforced over PCC - Full reinforcement", + "790" = "Non-Reinforced over PCC - Continuous reinforcement", + "792" = "Reinforced over PCC - Continuous reinforcement", + "800" = "800 Brick, Block or Other" ) From b402b7709dbecd045fe286b265e4716f960b039f Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 18:20:35 +0000 Subject: [PATCH 39/74] Add docs --- dbt/models/spatial/docs.md | 11 +++++++++++ dbt/models/spatial/schema.yml | 3 +++ 2 files changed, 14 insertions(+) diff --git a/dbt/models/spatial/docs.md b/dbt/models/spatial/docs.md index 096221bb8..c4fa435f2 100644 --- a/dbt/models/spatial/docs.md +++ b/dbt/models/spatial/docs.md @@ -493,6 +493,17 @@ Includes townships within the City of Chicago, which are technically defunct. **Geometry:** `MULTIPOLYGON` {% enddocs %} +# traffic + +{% docs table_traffic %} + +Illinois Department of Transportation data source from +[https://apps1.dot.illinois.gov/gist2/](https://apps1.dot.illinois.gov/gist2/). 
+ + +**Geometry:** `MULTILINESTRING` +{% enddocs %} + # transit_dict {% docs table_transit_dict %} diff --git a/dbt/models/spatial/schema.yml b/dbt/models/spatial/schema.yml index 0982b8106..513900073 100644 --- a/dbt/models/spatial/schema.yml +++ b/dbt/models/spatial/schema.yml @@ -174,6 +174,9 @@ sources: - name: township description: '{{ doc("table_township") }}' + - name: traffic + description: '{{ doc("table_traffic") }}' + - name: transit_dict description: '{{ doc("table_transit_dict") }}' From 1dfd84b39cec378c580b08faff231870a68c4b95 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 18:21:10 +0000 Subject: [PATCH 40/74] better commenting --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 10ae58dfc..4780f7136 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -10,7 +10,7 @@ AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic/" output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) -# Recoding of road data +# Recoding of road type road_codes <- c( "010" = "Unimproved", "020" = "Graded and Drained", From 22741ab49d571d7a6f94b676ab2897fbfa7527f2 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 19:31:44 +0000 Subject: [PATCH 41/74] Remove duplicated code --- .../spatial/spatial-environment-traffic.R | 8 -------- 1 file changed, 8 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 4780f7136..4154ddd26 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -80,14 +80,6 @@ walk(parquet_files, \(file_key) { st_transform(4326) %>% mutate(geometry_3435 = st_transform(geometry, 3435)) - # Convert the S3 object into raw data and read using geoarrow - # shapefile_data <- geoarrow::read_geoparquet_sf( - # file.path(AWS_S3_RAW_BUCKET, file_key) - # ) %>% - # st_transform(4326) %>% - # mutate(geometry_3435 = st_transform(geometry, 3435)) - - # We do this because some columns are not present in # older versions of the data required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", From c0450fffbb6dbac26e4a1ea706d5e4478655dd27 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 19:36:11 +0000 Subject: [PATCH 42/74] Rename SURF_YR --- .../spatial/spatial-environment-traffic.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 4154ddd26..64d0361cd 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -82,7 +82,7 @@ walk(parquet_files, \(file_key) { # We do this because some columns are not present in # older versions of the data - required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", + 
required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY", "geometry_3435", "year") @@ -96,7 +96,7 @@ walk(parquet_files, \(file_key) { lanes = if ("LNS" %in% colnames(.)) LNS else NA, surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, - surface_year = if ("SRF_YR" %in% colnames(.)) SRF_YR else NA, + surface_year = if ("SURF_YR" %in% colnames(.)) SURF_YR else NA, annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, @@ -110,7 +110,7 @@ walk(parquet_files, \(file_key) { # Recode surface_type based on road codes mutate(surface_type = road_codes[as.character(surface_type)]) %>% # Select and remove unnecessary columns - select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SRF_YR", "AADT", "CRS_WITH", + select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", "SP_LIM", "INVENTORY"))) %>% # Replace all 0 values with NA, excluding the geometry column From cfafd128dbbfaee76dc9851f1d17d53aea11c09a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 19:46:03 +0000 Subject: [PATCH 43/74] Better renaming --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 64d0361cd..8cbcd5065 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -56,7 +56,7 @@ road_codes <- c( "782" = "Reinforced over PCC - Full reinforcement", "790" = "Non-Reinforced over PCC - Continuous reinforcement", "792" = "Reinforced over PCC - Continuous reinforcement", - "800" = "800 Brick, Block or Other" + "800" = "Brick, Block or Other" ) From 8af8f076a090bdd6cbbb5a6bd10a179820ccf419 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 10 Oct 2024 20:48:26 +0000 Subject: [PATCH 44/74] rename traffic, fix surface_year --- .../spatial/spatial-environment-traffic.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 8cbcd5065..322e106ed 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -97,7 +97,7 @@ walk(parquet_files, \(file_key) { surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, surface_year = if ("SURF_YR" %in% colnames(.)) SURF_YR else NA, - annual_traffic = if ("AADT" %in% colnames(.)) AADT else NA, + daily_traffic = if ("AADT" %in% colnames(.)) AADT else NA, condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, @@ -115,6 +115,7 @@ walk(parquet_files, \(file_key) { 
"SP_LIM", "INVENTORY"))) %>% # Replace all 0 values with NA, excluding the geometry column mutate(across(-geometry, ~replace(., . %in% c(0, "0000"), NA))) %>% + mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% geoarrow::write_geoparquet( file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) ) From 75b1bbb2fd4202e909293b26a6c8543cbe85b1ac Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 15:50:35 +0000 Subject: [PATCH 45/74] Add mean values --- .../spatial/spatial-environment-traffic.R | 127 +++++++++++++++--- 1 file changed, 105 insertions(+), 22 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 322e106ed..246186141 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -59,7 +59,6 @@ road_codes <- c( "800" = "Brick, Block or Other" ) - # Get the 'Key' parquet_files <- get_bucket_df( bucket = AWS_S3_RAW_BUCKET, prefix = s3_folder @@ -80,16 +79,14 @@ walk(parquet_files, \(file_key) { st_transform(4326) %>% mutate(geometry_3435 = st_transform(geometry, 3435)) - # We do this because some columns are not present in - # older versions of the data - required_columns <- c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", - "CRS_WITH", "CRS_OPP", "CRS_YR", - "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY", "geometry_3435", "year") + required_columns <- c( + "FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", + "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", + "SP_LIM", "INVENTORY", "geometry_3435", "year" + ) - # Select only the non-geometry columns that exist in the dataset existing_columns <- intersect(required_columns, colnames(shapefile_data)) - shapefile_data %>% + shapefile_data <- shapefile_data %>% select(all_of(existing_columns)) %>% mutate( road_type = if ("FCNAME" %in% colnames(.)) FCNAME else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, @@ -107,21 +104,107 @@ walk(parquet_files, \(file_key) { speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA ) %>% - # Recode surface_type based on road codes - mutate(surface_type = road_codes[as.character(surface_type)]) %>% - # Select and remove unnecessary columns - select(-one_of(c("FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", "CRS_WITH", - "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY"))) %>% - # Replace all 0 values with NA, excluding the geometry column + mutate(surface_type = road_codes[as.character(surface_type)], + speed_limit = as.numeric(speed_limit)) %>% + select(-one_of(required_columns)) %>% mutate(across(-geometry, ~replace(., . 
%in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% - geoarrow::write_geoparquet( - file.path(AWS_S3_WAREHOUSE_BUCKET, file_key) - ) + group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% + summarize(geometry = st_union(geometry)) %>% + ungroup() - print(paste(file_key, "cleaned and uploaded.")) + # Function to create the intersection matrix and compute average traffic + calculate_traffic_averages <- function(shapefile_data) { + # Create an intersection matrix for averages + intersection_matrix <- st_intersects(shapefile_data) + + # Create intersecting pairs + intersecting_pairs <- do.call(rbind, lapply(seq_along(intersection_matrix), function(i) { + data.frame(polygon_1 = i, polygon_2 = intersection_matrix[[i]]) + })) %>% + filter(polygon_1 != polygon_2) # Remove self-matches + + # Add polygon ID and relevant columns to shapefile data + shapefile_with_ids <- shapefile_data %>% + mutate(polygon_id = row_number()) %>% + select(polygon_id, road_name, daily_traffic, speed_limit, lanes) + + # Join intersecting pairs with matching street names + averages <- intersecting_pairs %>% + left_join( + shapefile_with_ids %>% + rename( + road_name_1 = road_name, + daily_traffic_1 = daily_traffic, + speed_limit_1 = speed_limit, + lanes_1 = lanes + ), + by = c("polygon_1" = "polygon_id") + ) %>% + left_join( + shapefile_with_ids %>% + rename( + road_name_2 = road_name, + daily_traffic_2 = daily_traffic, + speed_limit_2 = speed_limit, + lanes_2 = lanes + ), + by = c("polygon_2" = "polygon_id") + ) %>% + filter(road_name_1 == road_name_2) %>% # Keep only matching road names + group_by(polygon_1) %>% + summarize( + average_daily_traffic = mean(daily_traffic_2, na.rm = TRUE), + average_speed_limit = mean(speed_limit_2, na.rm = TRUE), + average_lanes = mean(lanes_2, na.rm = TRUE), + .groups = 'drop' + ) + + # Update traffic, speed limit, and lanes with averages if needed + shapefile_data <- shapefile_data %>% + mutate(polygon_id = row_number()) %>% + left_join(averages, by = c("polygon_id" = "polygon_1")) %>% + mutate( + daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), + speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), + num_lanes = if_else(is.na(lanes), average_lanes, lanes) + ) + + return(shapefile_data) + } + + + # Loop until no changes are made + shapefile_data_final <- shapefile_data + calculate_traffic_with_loop <- function(shapefile_data) { + # Initialize final shapefile data + shapefile_data_final <- shapefile_data + repeat { + # Save current values to compare changes + previous_traffic <- shapefile_data_final$daily_traffic + previous_speed <- shapefile_data_final$speed_limit + previous_lanes <- shapefile_data_final$num_lanes + + # Recalculate averages and update shapefile data + shapefile_data_final <- calculate_traffic_averages(shapefile_data_final) + + # Check if all values remain unchanged + if (all(previous_traffic == shapefile_data_final$daily_traffic, na.rm = TRUE) && + all(previous_speed == shapefile_data_final$speed_limit, na.rm = TRUE) && + all(previous_lanes == shapefile_data_final$num_lanes, na.rm = TRUE)) { + break # Exit loop if no changes were made + } + } + + return(shapefile_data_final) + } + + + output_path <- file.path(output_bucket, basename(file_key)) + geoarrow::write_geoparquet(shapefile_data_final, output_path) + + print(paste(file_key, "cleaned and uploaded.")) } -} -) +}) + From a86203cc7e20415ed7ba1367fa033f5671ef445d Mon Sep 17 00:00:00 2001 
From: Damonamajor Date: Thu, 17 Oct 2024 15:53:11 +0000 Subject: [PATCH 46/74] Update commenting --- .../spatial/spatial-environment-traffic.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 246186141..baa8d17c6 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -124,15 +124,17 @@ walk(parquet_files, \(file_key) { })) %>% filter(polygon_1 != polygon_2) # Remove self-matches - # Add polygon ID and relevant columns to shapefile data + # Add polygon ID and relevant columns to shapefile data. This allows us to later merge + # data with the intersection pairs above. shapefile_with_ids <- shapefile_data %>% mutate(polygon_id = row_number()) %>% select(polygon_id, road_name, daily_traffic, speed_limit, lanes) - # Join intersecting pairs with matching street names + # Join intersecting pairs with matching street IDs averages <- intersecting_pairs %>% left_join( shapefile_with_ids %>% + # Create IDs for the "home" street rename( road_name_1 = road_name, daily_traffic_1 = daily_traffic, @@ -143,6 +145,7 @@ walk(parquet_files, \(file_key) { ) %>% left_join( shapefile_with_ids %>% + # Create IDs for the neighboring streets rename( road_name_2 = road_name, daily_traffic_2 = daily_traffic, @@ -153,6 +156,7 @@ walk(parquet_files, \(file_key) { ) %>% filter(road_name_1 == road_name_2) %>% # Keep only matching road names group_by(polygon_1) %>% + # Create averages summarize( average_daily_traffic = mean(daily_traffic_2, na.rm = TRUE), average_speed_limit = mean(speed_limit_2, na.rm = TRUE), @@ -160,7 +164,7 @@ walk(parquet_files, \(file_key) { .groups = 'drop' ) - # Update traffic, speed limit, and lanes with averages if needed + # Update traffic, speed limit, and lanes with averages shapefile_data <- shapefile_data %>% mutate(polygon_id = row_number()) %>% left_join(averages, by = c("polygon_id" = "polygon_1")) %>% @@ -200,7 +204,6 @@ walk(parquet_files, \(file_key) { return(shapefile_data_final) } - output_path <- file.path(output_bucket, basename(file_key)) geoarrow::write_geoparquet(shapefile_data_final, output_path) From 184b0e2fd11ecfade00ce9a7c61de09e0c683209 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 16:01:27 +0000 Subject: [PATCH 47/74] Remove extra line --- .../spatial/spatial-environment-traffic.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index baa8d17c6..9a6d9fd42 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -5,7 +5,7 @@ library(sf) library(geoarrow) # Define the S3 bucket and folder path -AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") +AWS_S3_RAW_BUCKET <- "s3://ccao-data-raw-us-east-1" AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic/" output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) @@ -205,7 +205,7 @@ walk(parquet_files, \(file_key) { } output_path <- file.path(output_bucket, basename(file_key)) - geoarrow::write_geoparquet(shapefile_data_final, 
output_path) + # geoarrow::write_geoparquet(shapefile_data_final, output_path) print(paste(file_key, "cleaned and uploaded.")) } From b074eeec7f3ef0740621289c983879598f0b3d7a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 16:09:01 +0000 Subject: [PATCH 48/74] Run function --- .../spatial/spatial-environment-traffic.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 9a6d9fd42..4a49f0383 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -204,6 +204,8 @@ walk(parquet_files, \(file_key) { return(shapefile_data_final) } + calculate_traffic_with_loop(shapefile_data) + output_path <- file.path(output_bucket, basename(file_key)) # geoarrow::write_geoparquet(shapefile_data_final, output_path) From 4eb17feb1e037a90f364763b018ec9b835a080d2 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 16:09:58 +0000 Subject: [PATCH 49/74] Revert num_lanes --- .../spatial/spatial-environment-traffic.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 4a49f0383..bc5cc4fc4 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -171,7 +171,7 @@ walk(parquet_files, \(file_key) { mutate( daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), - num_lanes = if_else(is.na(lanes), average_lanes, lanes) + lanes = if_else(is.na(lanes), average_lanes, lanes) ) return(shapefile_data) @@ -188,7 +188,7 @@ walk(parquet_files, \(file_key) { # Save current values to compare changes previous_traffic <- shapefile_data_final$daily_traffic previous_speed <- shapefile_data_final$speed_limit - previous_lanes <- shapefile_data_final$num_lanes + previous_lanes <- shapefile_data_final$lanes # Recalculate averages and update shapefile data shapefile_data_final <- calculate_traffic_averages(shapefile_data_final) From 6840eb35b7b76bfd847bc4f129c889a72acdc6a7 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 17:19:16 +0000 Subject: [PATCH 50/74] Get loop working --- .../spatial/spatial-environment-traffic.R | 176 +++++++++--------- 1 file changed, 93 insertions(+), 83 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index bc5cc4fc4..087b0d496 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -5,7 +5,7 @@ library(sf) library(geoarrow) # Define the S3 bucket and folder path -AWS_S3_RAW_BUCKET <- "s3://ccao-data-raw-us-east-1" +AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic/" output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) @@ -105,109 +105,119 @@ walk(parquet_files, \(file_key) { inventory_id = 
if ("INVENTORY" %in% colnames(.)) INVENTORY else NA ) %>% mutate(surface_type = road_codes[as.character(surface_type)], - speed_limit = as.numeric(speed_limit)) %>% - select(-one_of(required_columns)) %>% + speed_limit = as.numeric(speed_limit), + road_name = str_to_lower(road_name), # Convert to lowercase + road_name = gsub("[[:punct:]]", "", road_name)) %>% # Remove punctuation like . / etc. + select(-one_of(required_columns)) %>% mutate(across(-geometry, ~replace(., . %in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% summarize(geometry = st_union(geometry)) %>% ungroup() - # Function to create the intersection matrix and compute average traffic - calculate_traffic_averages <- function(shapefile_data) { - # Create an intersection matrix for averages - intersection_matrix <- st_intersects(shapefile_data) - - # Create intersecting pairs - intersecting_pairs <- do.call(rbind, lapply(seq_along(intersection_matrix), function(i) { - data.frame(polygon_1 = i, polygon_2 = intersection_matrix[[i]]) - })) %>% - filter(polygon_1 != polygon_2) # Remove self-matches - - # Add polygon ID and relevant columns to shapefile data. This allows us to later merge - # data with the intersection pairs above. - shapefile_with_ids <- shapefile_data %>% - mutate(polygon_id = row_number()) %>% - select(polygon_id, road_name, daily_traffic, speed_limit, lanes) - - # Join intersecting pairs with matching street IDs - averages <- intersecting_pairs %>% - left_join( - shapefile_with_ids %>% - # Create IDs for the "home" street - rename( - road_name_1 = road_name, - daily_traffic_1 = daily_traffic, - speed_limit_1 = speed_limit, - lanes_1 = lanes - ), - by = c("polygon_1" = "polygon_id") - ) %>% - left_join( - shapefile_with_ids %>% - # Create IDs for the neighboring streets - rename( - road_name_2 = road_name, - daily_traffic_2 = daily_traffic, - speed_limit_2 = speed_limit, - lanes_2 = lanes - ), - by = c("polygon_2" = "polygon_id") - ) %>% - filter(road_name_1 == road_name_2) %>% # Keep only matching road names - group_by(polygon_1) %>% - # Create averages - summarize( - average_daily_traffic = mean(daily_traffic_2, na.rm = TRUE), - average_speed_limit = mean(speed_limit_2, na.rm = TRUE), - average_lanes = mean(lanes_2, na.rm = TRUE), - .groups = 'drop' - ) - - # Update traffic, speed limit, and lanes with averages - shapefile_data <- shapefile_data %>% - mutate(polygon_id = row_number()) %>% - left_join(averages, by = c("polygon_id" = "polygon_1")) %>% - mutate( - daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), - speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), - lanes = if_else(is.na(lanes), average_lanes, lanes) - ) - - return(shapefile_data) - } - + # Function to compute traffic averages with a loop until no changes are made + calculate_traffic_data <- function(shapefile_data) { + # Helper function to calculate averages based on intersections + calculate_traffic_averages <- function(data) { + # Create an intersection matrix + intersection_matrix <- st_intersects(data) + + # Create intersecting pairs + intersecting_pairs <- do.call(rbind, lapply(seq_along(intersection_matrix), function(i) { + data.frame(polygon_1 = i, polygon_2 = intersection_matrix[[i]]) + })) %>% + filter(polygon_1 != polygon_2) # Remove self-matches + + # Add polygon IDs and relevant columns for merging + data_with_ids <- data %>% + mutate(polygon_id = 
row_number()) %>% + select(polygon_id, road_name, daily_traffic, speed_limit, lanes) + + # Join intersecting pairs with their respective polygon data + averages <- intersecting_pairs %>% + left_join( + data_with_ids %>% + rename( + road_name_1 = road_name, + daily_traffic_1 = daily_traffic, + speed_limit_1 = speed_limit, + lanes_1 = lanes + ), + by = c("polygon_1" = "polygon_id") + ) %>% + left_join( + data_with_ids %>% + rename( + road_name_2 = road_name, + daily_traffic_2 = daily_traffic, + speed_limit_2 = speed_limit, + lanes_2 = lanes + ), + by = c("polygon_2" = "polygon_id") + ) %>% + filter(road_name_1 == road_name_2) %>% # Keep only matching road names + group_by(polygon_1) %>% + summarize( + average_daily_traffic = mean(daily_traffic_2, na.rm = TRUE), + average_speed_limit = mean(speed_limit_2, na.rm = TRUE), + average_lanes = mean(lanes_2, na.rm = TRUE), + .groups = 'drop' + ) + + # Update the original data with averages where needed + updated_data <- data %>% + mutate(polygon_id = row_number()) %>% + left_join(averages, by = c("polygon_id" = "polygon_1")) %>% + mutate( + daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), + speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), + lanes = if_else(is.na(lanes), average_lanes, lanes) + ) + + return(updated_data) + } - # Loop until no changes are made - shapefile_data_final <- shapefile_data - calculate_traffic_with_loop <- function(shapefile_data) { - # Initialize final shapefile data + # Initialize loop shapefile_data_final <- shapefile_data - repeat { - # Save current values to compare changes + # Save current NA counts to compare changes + previous_na_traffic <- sum(is.na(shapefile_data_final$daily_traffic)) + previous_na_speed <- sum(is.na(shapefile_data_final$speed_limit)) + previous_na_lanes <- sum(is.na(shapefile_data_final$lanes)) + + # Save the current state to track changes previous_traffic <- shapefile_data_final$daily_traffic previous_speed <- shapefile_data_final$speed_limit previous_lanes <- shapefile_data_final$lanes - # Recalculate averages and update shapefile data - shapefile_data_final <- calculate_traffic_averages(shapefile_data_final) - - # Check if all values remain unchanged - if (all(previous_traffic == shapefile_data_final$daily_traffic, na.rm = TRUE) && - all(previous_speed == shapefile_data_final$speed_limit, na.rm = TRUE) && - all(previous_lanes == shapefile_data_final$num_lanes, na.rm = TRUE)) { - break # Exit loop if no changes were made + # Recalculate averages and update the data + shapefile_data_final <- calculate_traffic_averages(shapefile_data_final) %>% + select(-average_intersect_value) + + # Calculate current NA counts after updating + current_na_traffic <- sum(is.na(shapefile_data_final$daily_traffic)) + current_na_speed <- sum(is.na(shapefile_data_final$speed_limit)) + current_na_lanes <- sum(is.na(shapefile_data_final$lanes)) + + # Exit loop if no changes in NA counts are detected + if (current_na_traffic >= previous_na_traffic && + current_na_speed >= previous_na_speed && + current_na_lanes >= previous_na_lanes) { + cat("No reduction in NA counts detected. 
Exiting loop.\n") + break } } + return(shapefile_data_final) } - calculate_traffic_with_loop(shapefile_data) + # Run the function + calculate_traffic_data(shapefile_data) + output_path <- file.path(output_bucket, basename(file_key)) - # geoarrow::write_geoparquet(shapefile_data_final, output_path) + geoarrow::write_geoparquet(shapefile_data_final, output_path) print(paste(file_key, "cleaned and uploaded.")) } From fb86cc6944cec35784a2f96a1f6fc8691dbef266 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 21:00:46 +0000 Subject: [PATCH 51/74] Wrapup --- .../spatial/spatial-environment-traffic.R | 103 +++++++++--------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 087b0d496..ea090ac40 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -76,8 +76,7 @@ walk(parquet_files, \(file_key) { shapefile_data <- geoarrow::read_geoparquet_sf( file.path(AWS_S3_RAW_BUCKET, file_key) ) %>% - st_transform(4326) %>% - mutate(geometry_3435 = st_transform(geometry, 3435)) + st_transform(4326) required_columns <- c( "FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", @@ -104,19 +103,33 @@ walk(parquet_files, \(file_key) { speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA ) %>% - mutate(surface_type = road_codes[as.character(surface_type)], - speed_limit = as.numeric(speed_limit), - road_name = str_to_lower(road_name), # Convert to lowercase - road_name = gsub("[[:punct:]]", "", road_name)) %>% # Remove punctuation like . / etc. - select(-one_of(required_columns)) %>% - mutate(across(-geometry, ~replace(., . %in% c(0, "0000"), NA))) %>% - mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% + mutate( + surface_type = road_codes[as.character(surface_type)], + speed_limit = as.numeric(speed_limit), + road_name = str_to_lower(road_name), # Convert to lowercase + road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation like . / etc. + + # Replace full street name words with abbreviations + road_name = gsub("\\bavenue\\b", "ave", road_name), + road_name = gsub("\\bav\\b", "ave", road_name), + road_name = gsub("\\bstreet\\b", "st", road_name), + road_name = gsub("\\bcourt\\b", "ct", road_name), + road_name = gsub("\\broad\\b", "rd", road_name), + road_name = gsub("\\bdrive\\b", "dr", road_name), + road_name = gsub("\\bplace\\b", "pl", road_name), + road_name = gsub("\\blane\\b", "ln", road_name), + road_name = gsub("\\btrail\\b", "trl", road_name), + road_name = gsub("\\bparkway\\b", "pkwy", road_name), + road_name = gsub("\\bhighway\\b", "hwy", road_name), + road_name = gsub("\\bexpressway\\b", "expy", road_name) + ) %>% + select(-one_of(required_columns)) %>% # Drop unnecessary columns + mutate(across(-geometry, ~replace(., . 
%in% c(0, "0000"), NA))) %>% # Replace 0 and '0000' with NA + mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Replace 9999 with NA group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% - summarize(geometry = st_union(geometry)) %>% - ungroup() + summarize(geometry = st_union(geometry), .groups = "drop") %>% + mutate(geometry_3435 = st_transform(geometry, 3435)) - # Function to compute traffic averages with a loop until no changes are made - calculate_traffic_data <- function(shapefile_data) { # Helper function to calculate averages based on intersections calculate_traffic_averages <- function(data) { # Create an intersection matrix @@ -165,57 +178,47 @@ walk(parquet_files, \(file_key) { ) # Update the original data with averages where needed - updated_data <- data %>% + shapefile_data_final <- data %>% mutate(polygon_id = row_number()) %>% left_join(averages, by = c("polygon_id" = "polygon_1")) %>% mutate( daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), lanes = if_else(is.na(lanes), average_lanes, lanes) - ) + ) %>% + select(-c(average_daily_traffic, average_speed_limit, average_lanes, polygon_id)) - return(updated_data) + return(shapefile_data_final) } - # Initialize loop - shapefile_data_final <- shapefile_data + # Run the function + # Initialize with placeholder to ensure the first iteration runs + previous_na_counts <- list( + daily_traffic_na = -1, # Placeholder different from any real NA count + speed_limit_na = -1 # Same here + ) + + # Loop until no changes in NA counts repeat { - # Save current NA counts to compare changes - previous_na_traffic <- sum(is.na(shapefile_data_final$daily_traffic)) - previous_na_speed <- sum(is.na(shapefile_data_final$speed_limit)) - previous_na_lanes <- sum(is.na(shapefile_data_final$lanes)) - - # Save the current state to track changes - previous_traffic <- shapefile_data_final$daily_traffic - previous_speed <- shapefile_data_final$speed_limit - previous_lanes <- shapefile_data_final$lanes - - # Recalculate averages and update the data - shapefile_data_final <- calculate_traffic_averages(shapefile_data_final) %>% - select(-average_intersect_value) - - # Calculate current NA counts after updating - current_na_traffic <- sum(is.na(shapefile_data_final$daily_traffic)) - current_na_speed <- sum(is.na(shapefile_data_final$speed_limit)) - current_na_lanes <- sum(is.na(shapefile_data_final$lanes)) - - # Exit loop if no changes in NA counts are detected - if (current_na_traffic >= previous_na_traffic && - current_na_speed >= previous_na_speed && - current_na_lanes >= previous_na_lanes) { - cat("No reduction in NA counts detected. 
Exiting loop.\n") + # Calculate current NA counts + current_na_counts <- list( + daily_traffic_na = sum(is.na(shapefile_data$daily_traffic)), + speed_limit_na = sum(is.na(shapefile_data$speed_limit)) + ) + + # Check if NA counts have changed + if (!identical(current_na_counts, previous_na_counts)) { + print("NA values have changed, recalculating traffic averages.") + shapefile_data <- calculate_traffic_averages(shapefile_data) + + # Update previous NA counts for the next iteration + previous_na_counts <- current_na_counts + } else { + print("No further NA changes detected, stopping recalculation.") break } } - - return(shapefile_data_final) - } - - # Run the function - calculate_traffic_data(shapefile_data) - - output_path <- file.path(output_bucket, basename(file_key)) geoarrow::write_geoparquet(shapefile_data_final, output_path) From 540efdc582f03ea808e0de6920e6726cdab1cf9a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 21:10:46 +0000 Subject: [PATCH 52/74] Wrapup --- .../spatial/spatial-environment-traffic.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index ea090ac40..5f57f7179 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -109,6 +109,9 @@ walk(parquet_files, \(file_key) { road_name = str_to_lower(road_name), # Convert to lowercase road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation like . / etc. + # Remove standalone directional indicators (N, S, E, W) + road_name = gsub("\\b(n|s|e|w)\\b", "", road_name), + # Replace full street name words with abbreviations road_name = gsub("\\bavenue\\b", "ave", road_name), road_name = gsub("\\bav\\b", "ave", road_name), @@ -121,7 +124,10 @@ walk(parquet_files, \(file_key) { road_name = gsub("\\btrail\\b", "trl", road_name), road_name = gsub("\\bparkway\\b", "pkwy", road_name), road_name = gsub("\\bhighway\\b", "hwy", road_name), - road_name = gsub("\\bexpressway\\b", "expy", road_name) + road_name = gsub("\\bexpressway\\b", "expy", road_name), + + # Remove extra spaces that may result from replacements + road_name = str_trim(road_name) ) %>% select(-one_of(required_columns)) %>% # Drop unnecessary columns mutate(across(-geometry, ~replace(., . 
%in% c(0, "0000"), NA))) %>% # Replace 0 and '0000' with NA From 81cfdf66779f16e4918be1ccc9eda5bde0be58e2 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 17 Oct 2024 21:52:04 +0000 Subject: [PATCH 53/74] Linting --- .../spatial/spatial-environment-traffic.R | 40 ++-- .../spatial/spatial-environment-traffic.R | 177 +++++++++--------- 2 files changed, 111 insertions(+), 106 deletions(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 0891a53ba..6f72d9cd8 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -8,8 +8,10 @@ library(arrow) # Define S3 bucket and paths AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") -output_bucket <- file.path(AWS_S3_RAW_BUCKET, - "spatial", "environment", "traffic") +output_bucket <- file.path( + AWS_S3_RAW_BUCKET, + "spatial", "environment", "traffic" +) current_year <- strftime(Sys.Date(), "%Y") # Get list of available files @@ -26,14 +28,14 @@ years <- map(2012:year(Sys.Date()), \(x) { # Function to process each year and upload shapefiles for # that specific year to S3 process_shapefiles_for_year <- map(years, \(x) { - remote_file_path <- file.path(output_bucket, paste0(x, ".parquet")) # Skip everything if file already exists if (!object_exists(remote_file_path)) { # Define the URL for the shapefile ZIP file, dynamically for each year url <- paste0( - "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip") + "https://apps1.dot.illinois.gov/gist2/gisdata/all", x, ".zip" + ) # Create a temporary file to store the downloaded ZIP temp_zip <- tempfile(fileext = ".zip") @@ -46,33 +48,39 @@ process_shapefiles_for_year <- map(years, \(x) { # Unzip the file into a temporary directory unzip(temp_zip, exdir = temp_dir) - message(paste("Shapefile for year", x, - "unzipped into temporary directory.")) + message(paste( + "Shapefile for year", x, + "unzipped into temporary directory." + )) # List files in the unzipped directory and look for the .shp files unzipped_files <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) - shp_file_for_year <- unzipped_files[grepl(paste0("HWY", - x), - unzipped_files, - ignore.case = TRUE) - & grepl("\\.shp$", unzipped_files)] + shp_file_for_year <- unzipped_files[grepl( + paste0( + "HWY", + x + ), + unzipped_files, + ignore.case = TRUE + ) & + grepl("\\.shp$", unzipped_files)] # Process only the shapefile that matches the current year if (length(shp_file_for_year) == 1) { # Read the shapefile into the environment using sf::st_read shapefile_data <- sf::st_read(shp_file_for_year) %>% # Add filter for Cook County. The name changes in different years. 
- filter(if ("COUNTY" %in% names(.)) - COUNTY == '016' else INV_CO == '016') %>% + filter(if ("COUNTY" %in% names(.)) { + COUNTY == "016" + } else { + INV_CO == "016" + }) %>% mutate(year = as.character(x)) # Save the shapefile as a GeoParquet file geoarrow::write_geoparquet(shapefile_data, remote_file_path) - } else { message(paste("No shapefile found for year", x, ".")) } - } - }) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 5f57f7179..b382a8a54 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -67,9 +67,7 @@ parquet_files <- get_bucket_df( # Loop through each parquet file and process it walk(parquet_files, \(file_key) { - if (!aws.s3::object_exists(file.path(AWS_S3_WAREHOUSE_BUCKET, file_key))) { - print(paste("Cleaning", file_key)) # Convert the S3 object into raw data and read using geoarrow @@ -106,7 +104,7 @@ walk(parquet_files, \(file_key) { mutate( surface_type = road_codes[as.character(surface_type)], speed_limit = as.numeric(speed_limit), - road_name = str_to_lower(road_name), # Convert to lowercase + road_name = str_to_lower(road_name), # Convert to lowercase road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation like . / etc. # Remove standalone directional indicators (N, S, E, W) @@ -129,101 +127,101 @@ walk(parquet_files, \(file_key) { # Remove extra spaces that may result from replacements road_name = str_trim(road_name) ) %>% - select(-one_of(required_columns)) %>% # Drop unnecessary columns - mutate(across(-geometry, ~replace(., . %in% c(0, "0000"), NA))) %>% # Replace 0 and '0000' with NA - mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Replace 9999 with NA + select(-one_of(required_columns)) %>% # Drop unnecessary columns + mutate(across(-geometry, ~ replace(., . 
%in% c(0, "0000"), NA))) %>% # Replace 0 and '0000' with NA + mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Replace 9999 with NA group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% summarize(geometry = st_union(geometry), .groups = "drop") %>% mutate(geometry_3435 = st_transform(geometry, 3435)) - # Helper function to calculate averages based on intersections - calculate_traffic_averages <- function(data) { - # Create an intersection matrix - intersection_matrix <- st_intersects(data) - - # Create intersecting pairs - intersecting_pairs <- do.call(rbind, lapply(seq_along(intersection_matrix), function(i) { - data.frame(polygon_1 = i, polygon_2 = intersection_matrix[[i]]) - })) %>% - filter(polygon_1 != polygon_2) # Remove self-matches - - # Add polygon IDs and relevant columns for merging - data_with_ids <- data %>% - mutate(polygon_id = row_number()) %>% - select(polygon_id, road_name, daily_traffic, speed_limit, lanes) - - # Join intersecting pairs with their respective polygon data - averages <- intersecting_pairs %>% - left_join( - data_with_ids %>% - rename( - road_name_1 = road_name, - daily_traffic_1 = daily_traffic, - speed_limit_1 = speed_limit, - lanes_1 = lanes - ), - by = c("polygon_1" = "polygon_id") - ) %>% - left_join( - data_with_ids %>% - rename( - road_name_2 = road_name, - daily_traffic_2 = daily_traffic, - speed_limit_2 = speed_limit, - lanes_2 = lanes - ), - by = c("polygon_2" = "polygon_id") - ) %>% - filter(road_name_1 == road_name_2) %>% # Keep only matching road names - group_by(polygon_1) %>% - summarize( - average_daily_traffic = mean(daily_traffic_2, na.rm = TRUE), - average_speed_limit = mean(speed_limit_2, na.rm = TRUE), - average_lanes = mean(lanes_2, na.rm = TRUE), - .groups = 'drop' - ) - - # Update the original data with averages where needed - shapefile_data_final <- data %>% - mutate(polygon_id = row_number()) %>% - left_join(averages, by = c("polygon_id" = "polygon_1")) %>% - mutate( - daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), - speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), - lanes = if_else(is.na(lanes), average_lanes, lanes) - ) %>% - select(-c(average_daily_traffic, average_speed_limit, average_lanes, polygon_id)) - - return(shapefile_data_final) - } + # Helper function to calculate averages based on intersections + calculate_traffic_averages <- function(data) { + # Create an intersection matrix + intersection_matrix <- st_intersects(data) + + # Create intersecting pairs + intersecting_pairs <- do.call(rbind, lapply(seq_along(intersection_matrix), function(i) { + data.frame(polygon_1 = i, polygon_2 = intersection_matrix[[i]]) + })) %>% + filter(polygon_1 != polygon_2) # Remove self-matches + + # Add polygon IDs and relevant columns for merging + data_with_ids <- data %>% + mutate(polygon_id = row_number()) %>% + select(polygon_id, road_name, daily_traffic, speed_limit, lanes) + + # Join intersecting pairs with their respective polygon data + averages <- intersecting_pairs %>% + left_join( + data_with_ids %>% + rename( + road_name_1 = road_name, + daily_traffic_1 = daily_traffic, + speed_limit_1 = speed_limit, + lanes_1 = lanes + ), + by = c("polygon_1" = "polygon_id") + ) %>% + left_join( + data_with_ids %>% + rename( + road_name_2 = road_name, + daily_traffic_2 = daily_traffic, + speed_limit_2 = speed_limit, + lanes_2 = lanes + ), + by = c("polygon_2" = "polygon_id") + ) %>% + filter(road_name_1 == road_name_2) %>% # Keep only 
matching road names + group_by(polygon_1) %>% + summarize( + average_daily_traffic = mean(daily_traffic_2, na.rm = TRUE), + average_speed_limit = mean(speed_limit_2, na.rm = TRUE), + average_lanes = mean(lanes_2, na.rm = TRUE), + .groups = "drop" + ) - # Run the function - # Initialize with placeholder to ensure the first iteration runs - previous_na_counts <- list( - daily_traffic_na = -1, # Placeholder different from any real NA count - speed_limit_na = -1 # Same here + # Update the original data with averages where needed + shapefile_data_final <- data %>% + mutate(polygon_id = row_number()) %>% + left_join(averages, by = c("polygon_id" = "polygon_1")) %>% + mutate( + daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), + speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), + lanes = if_else(is.na(lanes), average_lanes, lanes) + ) %>% + select(-c(average_daily_traffic, average_speed_limit, average_lanes, polygon_id)) + + return(shapefile_data_final) + } + + # Run the function + # Initialize with placeholder to ensure the first iteration runs + previous_na_counts <- list( + daily_traffic_na = -1, # Placeholder different from any real NA count + speed_limit_na = -1 # Same here + ) + + # Loop until no changes in NA counts + repeat { + # Calculate current NA counts + current_na_counts <- list( + daily_traffic_na = sum(is.na(shapefile_data$daily_traffic)), + speed_limit_na = sum(is.na(shapefile_data$speed_limit)) ) - # Loop until no changes in NA counts - repeat { - # Calculate current NA counts - current_na_counts <- list( - daily_traffic_na = sum(is.na(shapefile_data$daily_traffic)), - speed_limit_na = sum(is.na(shapefile_data$speed_limit)) - ) + # Check if NA counts have changed + if (!identical(current_na_counts, previous_na_counts)) { + print("NA values have changed, recalculating traffic averages.") + shapefile_data <- calculate_traffic_averages(shapefile_data) - # Check if NA counts have changed - if (!identical(current_na_counts, previous_na_counts)) { - print("NA values have changed, recalculating traffic averages.") - shapefile_data <- calculate_traffic_averages(shapefile_data) - - # Update previous NA counts for the next iteration - previous_na_counts <- current_na_counts - } else { - print("No further NA changes detected, stopping recalculation.") - break - } + # Update previous NA counts for the next iteration + previous_na_counts <- current_na_counts + } else { + print("No further NA changes detected, stopping recalculation.") + break } + } output_path <- file.path(output_bucket, basename(file_key)) geoarrow::write_geoparquet(shapefile_data_final, output_path) @@ -231,4 +229,3 @@ walk(parquet_files, \(file_key) { print(paste(file_key, "cleaned and uploaded.")) } }) - From 7f1e4c1f05da138479d812f964a3515b70bbed25 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 21:59:18 +0000 Subject: [PATCH 54/74] linting --- .../spatial/spatial-environment-traffic.R | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index b382a8a54..17bd258ad 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -25,16 +25,25 @@ road_codes <- c( "501" = "Over PCC - Rubblized - Reinforcement unknown", "510" = "Over PCC 
- Rubblized - No reinforcement", "520" = "Over PCC - Rubblized - Partial reinforcement", - "525" = "Over PCC - Rubblized - With No or Partial Reinforcement - But having Hinged Joints", + "525" = paste( + "Over PCC - Rubblized - With No or Partial Reinforcement -", + "But having Hinged Joints" + ), "530" = "Over PCC - Rubblized - Full reinforcement", "540" = "Over PCC - Rubblized - Continuous reinforcement", "550" = "Bituminous Concrete (other than Class I)", "560" = "Bituminous Concrete Pavement (Full-Depth)", "600" = "Over PCC - Reinforcement unknown", "610" = "Over PCC - No reinforcement", - "615" = "Over PCC - No reinforcement but having short panels and dowels", + "615" = paste( + "Over PCC - No reinforcement but having short panels", + "and dowels" + ), "620" = "Over PCC - Partial reinforcement", - "625" = "Over PCC - With No or Partial Reinforcement - But having Hinged Joints", + "625" = paste( + "Over PCC - With No or Partial Reinforcement -", + "But having Hinged Joints" + ), "630" = "Over PCC - Full reinforcement", "640" = "Over PCC - Continuous reinforcement", "650" = "Over Brick, Block, Steel, or similar material", @@ -50,8 +59,14 @@ road_codes <- c( "767" = "Reinforced over PCC - No reinforcement", "770" = "Non-Reinforced over PCC - Partial reinforcement", "772" = "Reinforced over PCC - Partial reinforcement", - "775" = "Non-Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", - "777" = "Reinforced over PCC - With No or Partial reinforcement but having Hinged Joints", + "775" = paste( + "Non-Reinforced over PCC - With No or Partial reinforcement", + "but having Hinged Joints" + ), + "777" = paste( + "Reinforced over PCC - With No or Partial reinforcement", + "but having Hinged Joints" + ), "780" = "Non-Reinforced over PCC - Full reinforcement", "782" = "Reinforced over PCC - Full reinforcement", "790" = "Non-Reinforced over PCC - Continuous reinforcement", From fa29c882b2d79a1ce7905e12e02620bafe3f4d84 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 17 Oct 2024 22:03:29 +0000 Subject: [PATCH 55/74] linting --- .../spatial/spatial-environment-traffic.R | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 17bd258ad..441785ab3 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -101,7 +101,8 @@ walk(parquet_files, \(file_key) { shapefile_data <- shapefile_data %>% select(all_of(existing_columns)) %>% mutate( - road_type = if ("FCNAME" %in% colnames(.)) FCNAME else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, + road_type = if ("FCNAME" %in% colnames(.)) FCNAME + else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, lanes = if ("LNS" %in% colnames(.)) LNS else NA, surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, @@ -112,7 +113,8 @@ walk(parquet_files, \(file_key) { condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, - distress_opposing = if ("DTRESS_OPP" %in% colnames(.)) DTRESS_OPP else NA, + distress_opposing = if ("DTRESS_OPP" %in% + colnames(.)) DTRESS_OPP else NA, speed_limit = if 
("SP_LIM" %in% colnames(.)) SP_LIM else NA, inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA ) %>% @@ -120,7 +122,7 @@ walk(parquet_files, \(file_key) { surface_type = road_codes[as.character(surface_type)], speed_limit = as.numeric(speed_limit), road_name = str_to_lower(road_name), # Convert to lowercase - road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation like . / etc. + road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation # Remove standalone directional indicators (N, S, E, W) road_name = gsub("\\b(n|s|e|w)\\b", "", road_name), @@ -143,8 +145,8 @@ walk(parquet_files, \(file_key) { road_name = str_trim(road_name) ) %>% select(-one_of(required_columns)) %>% # Drop unnecessary columns - mutate(across(-geometry, ~ replace(., . %in% c(0, "0000"), NA))) %>% # Replace 0 and '0000' with NA - mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Replace 9999 with NA + mutate(across(-geometry, ~ replace(., . %in% c(0, "0000"), NA))) %>% + mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% summarize(geometry = st_union(geometry), .groups = "drop") %>% mutate(geometry_3435 = st_transform(geometry, 3435)) @@ -155,11 +157,16 @@ walk(parquet_files, \(file_key) { intersection_matrix <- st_intersects(data) # Create intersecting pairs - intersecting_pairs <- do.call(rbind, lapply(seq_along(intersection_matrix), function(i) { - data.frame(polygon_1 = i, polygon_2 = intersection_matrix[[i]]) - })) %>% - filter(polygon_1 != polygon_2) # Remove self-matches - + intersecting_pairs <- do.call( + rbind, + lapply(seq_along(intersection_matrix), function(i) { + data.frame( + polygon_1 = i, + polygon_2 = intersection_matrix[[i]] + ) + }) + ) %>% + filter(polygon_1 != polygon_2) # Remove self-matches # Add polygon IDs and relevant columns for merging data_with_ids <- data %>% mutate(polygon_id = row_number()) %>% @@ -201,11 +208,14 @@ walk(parquet_files, \(file_key) { mutate(polygon_id = row_number()) %>% left_join(averages, by = c("polygon_id" = "polygon_1")) %>% mutate( - daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, daily_traffic), - speed_limit = if_else(is.na(speed_limit), average_speed_limit, speed_limit), + daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, + daily_traffic), + speed_limit = if_else(is.na(speed_limit), average_speed_limit, + speed_limit), lanes = if_else(is.na(lanes), average_lanes, lanes) ) %>% - select(-c(average_daily_traffic, average_speed_limit, average_lanes, polygon_id)) + select(-c(average_daily_traffic, average_speed_limit, + average_lanes, polygon_id)) return(shapefile_data_final) } From 5a285fde0524845c5835f324927b892c9c0ba5d1 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 21 Oct 2024 16:06:34 +0000 Subject: [PATCH 56/74] Rename to shapefile_data --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 441785ab3..48768e0c7 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -249,7 +249,7 @@ walk(parquet_files, \(file_key) { } output_path <- file.path(output_bucket, basename(file_key)) - 
geoarrow::write_geoparquet(shapefile_data_final, output_path) + geoarrow::write_geoparquet(shapefile_data, output_path) print(paste(file_key, "cleaned and uploaded.")) } From 6734b5d690ebaf8ac9037761a6afca9d5cec4b2b Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 21 Oct 2024 18:20:56 +0000 Subject: [PATCH 57/74] Add commented text --- .../spatial/spatial-environment-traffic.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 48768e0c7..8bdb21aa0 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -91,6 +91,8 @@ walk(parquet_files, \(file_key) { ) %>% st_transform(4326) + # Because column names change, we can't just select, but create an intersection + # of columns we want and the renamed columns. required_columns <- c( "FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", @@ -147,11 +149,15 @@ walk(parquet_files, \(file_key) { select(-one_of(required_columns)) %>% # Drop unnecessary columns mutate(across(-geometry, ~ replace(., . %in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% + # Group by the characteristics that we want group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% + # Create a union of the streets based on the summarized features summarize(geometry = st_union(geometry), .groups = "drop") %>% - mutate(geometry_3435 = st_transform(geometry, 3435)) + mutate(geometry_3435 = st_transform(geometry, 3435)) %>% + ungroup() - # Helper function to calculate averages based on intersections + # Helper function to calculate averages based on intersections of streets with the + # same name and overlapping spatial features. 
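(Aside: a self-contained sketch of what this helper does, run on invented geometry rather than IDOT data. st_intersects() returns, for each segment, the indices of the segments it touches; keeping only touching segments that share a road_name and averaging their values fills the missing entry. The segment names, coordinates, and the neighbor_mean variable are illustrative, not from the script.)

library(sf)
library(dplyr)
library(purrr)

# Three toy segments: the two "north ave" pieces touch end to end at (1, 0)
segments <- st_sf(
  road_name = c("north ave", "north ave", "clark st"),
  speed_limit = c(30, NA, 25),
  geometry = st_sfc(
    st_linestring(rbind(c(0, 0), c(1, 0))),
    st_linestring(rbind(c(1, 0), c(2, 0))),
    st_linestring(rbind(c(5, 5), c(6, 5)))
  )
)

# For each segment, the indices of touching segments (self included)
neighbors <- st_intersects(segments)

# Mean value over touching segments that share a road name
neighbor_mean <- map_dbl(seq_len(nrow(segments)), \(i) {
  js <- setdiff(neighbors[[i]], i)
  js <- js[segments$road_name[js] == segments$road_name[i]]
  mean(segments$speed_limit[js], na.rm = TRUE)
})

segments <- segments %>%
  mutate(speed_limit = coalesce(speed_limit, neighbor_mean))

segments$speed_limit # 30, 30, 25: the NA segment inherits its neighbor's value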
calculate_traffic_averages <- function(data) { # Create an intersection matrix intersection_matrix <- st_intersects(data) From bc37c55723a0e65d6deac7a22aeb8d8d403bb537 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 21 Oct 2024 19:08:13 +0000 Subject: [PATCH 58/74] remove slash --- .../spatial/spatial-environment-traffic.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 8bdb21aa0..57238014b 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -1,13 +1,14 @@ library(aws.s3) library(dplyr) +library(geoarrow) library(purrr) library(sf) -library(geoarrow) +library(stringr) # Define the S3 bucket and folder path AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") -s3_folder <- "spatial/environment/traffic/" +s3_folder <- "spatial/environment/traffic" output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) # Recoding of road type From 91fab65174bf9fdd03e08c0529967d0c65d1ff8e Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 21 Oct 2024 20:09:17 +0000 Subject: [PATCH 59/74] remove hash --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 57238014b..6007a55a6 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -9,7 +9,7 @@ library(stringr) AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic" -output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder) +output_bucket <- sub("/$", "", file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder)) # Recoding of road type road_codes <- c( From 9eb0fe3748fcabbdaba96c53da67577649d83bbc Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 22 Oct 2024 18:48:54 +0000 Subject: [PATCH 60/74] Updates after viewing output --- .../spatial/spatial-environment-traffic.R | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 6007a55a6..a4171cb14 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -11,7 +11,7 @@ AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic" output_bucket <- sub("/$", "", file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder)) -# Recoding of road type +# Re-coding of road type road_codes <- c( "010" = "Unimproved", "020" = "Graded and Drained", @@ -124,10 +124,16 @@ walk(parquet_files, \(file_key) { mutate( surface_type = road_codes[as.character(surface_type)], speed_limit = as.numeric(speed_limit), + # For testing + road_name_preserved = road_name, road_name = str_to_lower(road_name), # Convert to lowercase road_name = 
gsub("[[:punct:]]", "", road_name), # Remove punctuation # Remove standalone directional indicators (N, S, E, W) + # I wouldn't remove North South East west, so that streets like North + # Ave become empty. I also discovered that TH is not universally applied. + # For example, you can see 100TH st. I don't think the added value + # of removing TH is worth the risk of complicating valid street names. road_name = gsub("\\b(n|s|e|w)\\b", "", road_name), # Replace full street name words with abbreviations @@ -147,11 +153,12 @@ walk(parquet_files, \(file_key) { # Remove extra spaces that may result from replacements road_name = str_trim(road_name) ) %>% - select(-one_of(required_columns)) %>% # Drop unnecessary columns + # Remove duplicated columns except for year + select(-one_of(required_columns[required_columns != "year"])) %>% mutate(across(-geometry, ~ replace(., . %in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Group by the characteristics that we want - group_by(road_name, speed_limit, lanes, surface_type, daily_traffic) %>% + group_by(road_name, speed_limit, lanes, surface_type, daily_traffic, year, road_type) %>% # Create a union of the streets based on the summarized features summarize(geometry = st_union(geometry), .groups = "drop") %>% mutate(geometry_3435 = st_transform(geometry, 3435)) %>% @@ -255,6 +262,9 @@ walk(parquet_files, \(file_key) { } } + shapefile_data <- shapefile_data %>% + mutate(across(-geometry, ~ ifelse(is.nan(.), NA, .))) + output_path <- file.path(output_bucket, basename(file_key)) geoarrow::write_geoparquet(shapefile_data, output_path) From 8237ace255ab977d3780d837b762c2102ef807fb Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 23 Oct 2024 16:20:58 +0000 Subject: [PATCH 61/74] Remove additional geom column from mutate --- .../spatial/spatial-environment-traffic.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index a4171cb14..beda69115 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -263,11 +263,12 @@ walk(parquet_files, \(file_key) { } shapefile_data <- shapefile_data %>% - mutate(across(-geometry, ~ ifelse(is.nan(.), NA, .))) + mutate(across(-c(geometry, geometry_3435), ~ ifelse(is.nan(.), NA, .))) output_path <- file.path(output_bucket, basename(file_key)) geoarrow::write_geoparquet(shapefile_data, output_path) print(paste(file_key, "cleaned and uploaded.")) } -}) +}, .progress = TRUE) + From c87c58c4fc1373bad36e31c73caa461c8fef17f3 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Fri, 1 Nov 2024 20:44:47 +0000 Subject: [PATCH 62/74] lintr --- .../spatial/spatial-environment-traffic.R | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index a4171cb14..725e77c04 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -92,8 +92,9 @@ walk(parquet_files, \(file_key) { ) %>% st_transform(4326) - # Because column names change, we can't just select, but create an 
intersection - # of columns we want and the renamed columns. + # Because column names change, we can't just select, + # but create an intersection of columns we want + # and the renamed columns. required_columns <- c( "FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", @@ -131,8 +132,10 @@ walk(parquet_files, \(file_key) { # Remove standalone directional indicators (N, S, E, W) # I wouldn't remove North South East west, so that streets like North - # Ave become empty. I also discovered that TH is not universally applied. - # For example, you can see 100TH st. I don't think the added value + # Ave become empty. I also discovered that + # TH is not universally applied. + # For example, you can look at 100TH st. + # I don't think the added value # of removing TH is worth the risk of complicating valid street names. road_name = gsub("\\b(n|s|e|w)\\b", "", road_name), @@ -158,14 +161,16 @@ walk(parquet_files, \(file_key) { mutate(across(-geometry, ~ replace(., . %in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Group by the characteristics that we want - group_by(road_name, speed_limit, lanes, surface_type, daily_traffic, year, road_type) %>% + group_by(road_name, speed_limit, lanes, + surface_type, daily_traffic, year, road_type) %>% # Create a union of the streets based on the summarized features summarize(geometry = st_union(geometry), .groups = "drop") %>% mutate(geometry_3435 = st_transform(geometry, 3435)) %>% ungroup() - # Helper function to calculate averages based on intersections of streets with the - # same name and overlapping spatial features. + # Helper function to calculate averages based on intersections + # of streets with the same name + # and overlapping spatial features. 
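(Aside: the script runs this helper repeatedly because one fill pass can turn a formerly missing segment into a donor for the next pass. Stripped down to its fixed-point skeleton, and assuming a calculate_traffic_averages() defined as in these hunks, the control flow is roughly:)

fill_until_stable <- function(data) {
  previous_na <- -1 # placeholder so the first pass always runs
  repeat {
    current_na <- sum(is.na(data$daily_traffic)) +
      sum(is.na(data$speed_limit)) +
      sum(is.na(data$lanes))
    # The fill step never un-fills a value, so the NA count is non-increasing
    # and the loop is guaranteed to terminate
    if (current_na == previous_na) break
    previous_na <- current_na
    data <- calculate_traffic_averages(data)
  }
  data
}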
calculate_traffic_averages <- function(data) { # Create an intersection matrix intersection_matrix <- st_intersects(data) From 9f067ee9896364e5cbfdd3096857531340e2f09a Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 3 Nov 2024 23:48:48 +0000 Subject: [PATCH 63/74] Linting --- .../spatial/spatial-environment-traffic.R | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index b8e7e52ca..854e6bd91 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -105,8 +105,9 @@ walk(parquet_files, \(file_key) { shapefile_data <- shapefile_data %>% select(all_of(existing_columns)) %>% mutate( - road_type = if ("FCNAME" %in% colnames(.)) FCNAME - else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, + road_type = if ("FCNAME" %in% colnames(.)) { + FCNAME + } else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, lanes = if ("LNS" %in% colnames(.)) LNS else NA, surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, @@ -118,7 +119,11 @@ walk(parquet_files, \(file_key) { road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, distress_opposing = if ("DTRESS_OPP" %in% - colnames(.)) DTRESS_OPP else NA, + colnames(.)) { + DTRESS_OPP + } else { + NA + }, speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA ) %>% @@ -161,8 +166,10 @@ walk(parquet_files, \(file_key) { mutate(across(-geometry, ~ replace(., . 
%in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Group by the characteristics that we want - group_by(road_name, speed_limit, lanes, - surface_type, daily_traffic, year, road_type) %>% + group_by( + road_name, speed_limit, lanes, + surface_type, daily_traffic, year, road_type + ) %>% # Create a union of the streets based on the summarized features summarize(geometry = st_union(geometry), .groups = "drop") %>% mutate(geometry_3435 = st_transform(geometry, 3435)) %>% @@ -185,7 +192,7 @@ walk(parquet_files, \(file_key) { ) }) ) %>% - filter(polygon_1 != polygon_2) # Remove self-matches + filter(polygon_1 != polygon_2) # Remove self-matches # Add polygon IDs and relevant columns for merging data_with_ids <- data %>% mutate(polygon_id = row_number()) %>% @@ -228,13 +235,17 @@ walk(parquet_files, \(file_key) { left_join(averages, by = c("polygon_id" = "polygon_1")) %>% mutate( daily_traffic = if_else(is.na(daily_traffic), average_daily_traffic, - daily_traffic), + daily_traffic + ), speed_limit = if_else(is.na(speed_limit), average_speed_limit, - speed_limit), + speed_limit + ), lanes = if_else(is.na(lanes), average_lanes, lanes) ) %>% - select(-c(average_daily_traffic, average_speed_limit, - average_lanes, polygon_id)) + select(-c( + average_daily_traffic, average_speed_limit, + average_lanes, polygon_id + )) return(shapefile_data_final) } @@ -276,4 +287,3 @@ walk(parquet_files, \(file_key) { print(paste(file_key, "cleaned and uploaded.")) } }, .progress = TRUE) - From 2fb559a916c34cfc4670b67448fb06627c003026 Mon Sep 17 00:00:00 2001 From: Damonamajor <56321109+Damonamajor@users.noreply.github.com> Date: Mon, 4 Nov 2024 08:59:49 -0600 Subject: [PATCH 64/74] Update dbt/models/spatial/docs.md Co-authored-by: William Ridgeway <10358980+wrridgeway@users.noreply.github.com> --- dbt/models/spatial/docs.md | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt/models/spatial/docs.md b/dbt/models/spatial/docs.md index c4fa435f2..86271fdae 100644 --- a/dbt/models/spatial/docs.md +++ b/dbt/models/spatial/docs.md @@ -500,7 +500,6 @@ Includes townships within the City of Chicago, which are technically defunct. Illinois Department of Transportation data source from [https://apps1.dot.illinois.gov/gist2/](https://apps1.dot.illinois.gov/gist2/). 
- **Geometry:** `MULTILINESTRING` {% enddocs %} From fd1945bf59ae4d8e5a10c73920b61a92b024cebf Mon Sep 17 00:00:00 2001 From: Damonamajor <56321109+Damonamajor@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:00:04 -0600 Subject: [PATCH 65/74] Update etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R Co-authored-by: William Ridgeway <10358980+wrridgeway@users.noreply.github.com> --- .../spatial/spatial-environment-traffic.R | 1 - 1 file changed, 1 deletion(-) diff --git a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R index 6f72d9cd8..aff491495 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-raw-us-east-1/spatial/spatial-environment-traffic.R @@ -12,7 +12,6 @@ output_bucket <- file.path( AWS_S3_RAW_BUCKET, "spatial", "environment", "traffic" ) -current_year <- strftime(Sys.Date(), "%Y") # Get list of available files years <- map(2012:year(Sys.Date()), \(x) { From bc045b002e134bb6b60cc5500caf1e8167bc867f Mon Sep 17 00:00:00 2001 From: Damonamajor <56321109+Damonamajor@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:00:33 -0600 Subject: [PATCH 66/74] Update etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R Co-authored-by: William Ridgeway <10358980+wrridgeway@users.noreply.github.com> --- .../spatial/spatial-environment-traffic.R | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 854e6bd91..3b91bd3bf 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -145,18 +145,22 @@ walk(parquet_files, \(file_key) { road_name = gsub("\\b(n|s|e|w)\\b", "", road_name), # Replace full street name words with abbreviations - road_name = gsub("\\bavenue\\b", "ave", road_name), - road_name = gsub("\\bav\\b", "ave", road_name), - road_name = gsub("\\bstreet\\b", "st", road_name), - road_name = gsub("\\bcourt\\b", "ct", road_name), - road_name = gsub("\\broad\\b", "rd", road_name), - road_name = gsub("\\bdrive\\b", "dr", road_name), - road_name = gsub("\\bplace\\b", "pl", road_name), - road_name = gsub("\\blane\\b", "ln", road_name), - road_name = gsub("\\btrail\\b", "trl", road_name), - road_name = gsub("\\bparkway\\b", "pkwy", road_name), - road_name = gsub("\\bhighway\\b", "hwy", road_name), - road_name = gsub("\\bexpressway\\b", "expy", road_name), +road_name = str_replace_all( + road_name, + c("\\bavenue\\b" = "ave", + "\\bav\\b" = "ave", + "\\bstreet\\b" = "st", + "\\bcourt\\b" = "ct", + "\\broad\\b" = "rd", + "\\bdrive\\b" = "dr", + "\\bplace\\b" = "pl", + "\\blane\\b" = "ln", + "\\btrail\\b" = "trl", + "\\bparkway\\b" = "pkwy", + "\\bhighway\\b" = "hwy", + "\\bexpressway\\b" = "expy" + ) + ), # Remove extra spaces that may result from replacements road_name = str_trim(road_name) From 9810b7e3b11209292e5c9dde9d976c6f4d64e55a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 16:00:46 +0000 Subject: [PATCH 67/74] Billy changes --- .../spatial/spatial-environment-traffic.R | 58 +++++++++---------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R 
b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 3b91bd3bf..0c4aeb62c 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -6,7 +6,7 @@ library(sf) library(stringr) # Define the S3 bucket and folder path -AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") +AWS_S3_RAW_BUCKET <- "s3://ccao-data-raw-us-east-1" AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic" output_bucket <- sub("/$", "", file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder)) @@ -101,40 +101,36 @@ walk(parquet_files, \(file_key) { "SP_LIM", "INVENTORY", "geometry_3435", "year" ) + # Define the renaming mapping + renames <- c( + "FCNAME" = "road_type", + "FC_NAME" = "road_type", + "LNS" = "lanes", + "SURF_TYP" = "surface_type", + "SURF_WTH" = "surface_width", + "SURF_YR" = "surface_year", + "AADT" = "daily_traffic", + "CRS_WITH" = "condition_with", + "CRS_OPP" = "condition_opposing", + "CRS_YR" = "condition_year", + "ROAD_NAME" = "road_name", + "DTRESS_WTH" = "distress_with", + "DTRESS_OPP" = "distress_opposing" + ) + + existing_columns <- intersect(required_columns, colnames(shapefile_data)) shapefile_data <- shapefile_data %>% select(all_of(existing_columns)) %>% - mutate( - road_type = if ("FCNAME" %in% colnames(.)) { - FCNAME - } else if ("FC_NAME" %in% colnames(.)) FC_NAME else NA, - lanes = if ("LNS" %in% colnames(.)) LNS else NA, - surface_type = if ("SURF_TYP" %in% colnames(.)) SURF_TYP else NA, - surface_width = if ("SURF_WTH" %in% colnames(.)) SURF_WTH else NA, - surface_year = if ("SURF_YR" %in% colnames(.)) SURF_YR else NA, - daily_traffic = if ("AADT" %in% colnames(.)) AADT else NA, - condition_with = if ("CRS_WITH" %in% colnames(.)) CRS_WITH else NA, - condition_opposing = if ("CRS_OPP" %in% colnames(.)) CRS_OPP else NA, - condition_year = if ("CRS_YR" %in% colnames(.)) CRS_YR else NA, - road_name = if ("ROAD_NAME" %in% colnames(.)) ROAD_NAME else NA, - distress_with = if ("DTRESS_WTH" %in% colnames(.)) DTRESS_WTH else NA, - distress_opposing = if ("DTRESS_OPP" %in% - colnames(.)) { - DTRESS_OPP - } else { - NA - }, - speed_limit = if ("SP_LIM" %in% colnames(.)) SP_LIM else NA, - inventory_id = if ("INVENTORY" %in% colnames(.)) INVENTORY else NA - ) %>% - mutate( + # Dynamically rename and select columns based on the existing names in the dataset + rename_with(~ renames[.x], .cols = intersect(names(.), names(renames))) %>% + mutate(across(all_of(renames), ~ ifelse(is.na(.), NA, .)), surface_type = road_codes[as.character(surface_type)], speed_limit = as.numeric(speed_limit), # For testing road_name_preserved = road_name, road_name = str_to_lower(road_name), # Convert to lowercase road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation - # Remove standalone directional indicators (N, S, E, W) # I wouldn't remove North South East west, so that streets like North # Ave become empty. 
I also discovered that
@@ -256,13 +252,14 @@ road_name = str_replace_all(
 
   # Run the function
   # Initialize with placeholder to ensure the first iteration runs
+  # Initialize previous NA counts with values that differ from any real NA count
   previous_na_counts <- list(
-    daily_traffic_na = -1, # Placeholder different from any real NA count
-    speed_limit_na = -1 # Same here
+    daily_traffic_na = -1,
+    speed_limit_na = -1
   )
 
-  # Loop until no changes in NA counts
-  repeat {
+  # Loop until there are no changes in NA counts
+  while (TRUE) {
     # Calculate current NA counts
     current_na_counts <- list(
       daily_traffic_na = sum(is.na(shapefile_data$daily_traffic)),
@@ -282,6 +279,7 @@ road_name = str_replace_all(
     }
   }
 
+
   shapefile_data <- shapefile_data %>%
     mutate(across(-c(geometry, geometry_3435), ~ ifelse(is.nan(.), NA, .)))
 

From de0526cd5382d1410705b2cf61dae01589684f83 Mon Sep 17 00:00:00 2001
From: Damonamajor
Date: Mon, 4 Nov 2024 18:13:59 +0000
Subject: [PATCH 68/74] Working file with doc updates

---
 dbt/models/spatial/docs.md                    |  6 +++
 .../spatial/spatial-environment-traffic.R     | 44 ++++++++++---------
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/dbt/models/spatial/docs.md b/dbt/models/spatial/docs.md
index 86271fdae..95bda9c5c 100644
--- a/dbt/models/spatial/docs.md
+++ b/dbt/models/spatial/docs.md
@@ -499,6 +499,12 @@ Includes townships within the City of Chicago, which are technically defunct.
 
 Illinois Department of Transportation data source from
 [https://apps1.dot.illinois.gov/gist2/](https://apps1.dot.illinois.gov/gist2/).
+The data focuses on five features: lanes, speed limits, traffic count, road
+type, and surface type. Some columns are not present in all years of data (for
+example, speed limit in 2012). Values are also not universally present within a
+year, so we average numeric values for roads that overlap and share a name. For
+example, if segment B touches segments A and C with speed limits of 25 and 30,
+the speed limit for segment B will be 27.5.
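(Aside: the same arithmetic as the example just above, in plain R with the hypothetical speed limits of 25 and 30.)

speed_a <- 25 # speed limit of touching segment A
speed_c <- 30 # speed limit of touching segment C
speed_b <- mean(c(speed_a, speed_c)) # segment B gets the neighbor mean: 27.5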
**Geometry:** `MULTILINESTRING` {% enddocs %} diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 0c4aeb62c..b39ac26f4 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -101,7 +101,6 @@ walk(parquet_files, \(file_key) { "SP_LIM", "INVENTORY", "geometry_3435", "year" ) - # Define the renaming mapping renames <- c( "FCNAME" = "road_type", "FC_NAME" = "road_type", @@ -110,6 +109,7 @@ walk(parquet_files, \(file_key) { "SURF_WTH" = "surface_width", "SURF_YR" = "surface_year", "AADT" = "daily_traffic", + "SP_LIM" = "speed_limit", "CRS_WITH" = "condition_with", "CRS_OPP" = "condition_opposing", "CRS_YR" = "condition_year", @@ -118,13 +118,19 @@ walk(parquet_files, \(file_key) { "DTRESS_OPP" = "distress_opposing" ) + shapefile_data <- shapefile_data %>% + rename_with(~ renames[.x], .cols = intersect(names(.), names(renames))) + + # Create a list of required columns based on the rename mappings + required_columns <- unique(unname(renames)) + + # Identify missing renamed columns and add them with NA values + missing_columns <- setdiff(required_columns, colnames(shapefile_data)) + shapefile_data[missing_columns] <- NA + - existing_columns <- intersect(required_columns, colnames(shapefile_data)) shapefile_data <- shapefile_data %>% - select(all_of(existing_columns)) %>% - # Dynamically rename and select columns based on the existing names in the dataset - rename_with(~ renames[.x], .cols = intersect(names(.), names(renames))) %>% - mutate(across(all_of(renames), ~ ifelse(is.na(.), NA, .)), + mutate( surface_type = road_codes[as.character(surface_type)], speed_limit = as.numeric(speed_limit), # For testing @@ -132,16 +138,16 @@ walk(parquet_files, \(file_key) { road_name = str_to_lower(road_name), # Convert to lowercase road_name = gsub("[[:punct:]]", "", road_name), # Remove punctuation # Remove standalone directional indicators (N, S, E, W) - # I wouldn't remove North South East west, so that streets like North + # I wouldn't remove North South East West, so that streets like North # Ave become empty. I also discovered that - # TH is not universally applied. - # For example, you can look at 100TH st. + # TH is not universally applied. An example is 100th St. # I don't think the added value # of removing TH is worth the risk of complicating valid street names. + # Once again, ending in th would change North Ave. road_name = gsub("\\b(n|s|e|w)\\b", "", road_name), # Replace full street name words with abbreviations -road_name = str_replace_all( + road_name = str_replace_all( road_name, c("\\bavenue\\b" = "ave", "\\bav\\b" = "ave", @@ -161,8 +167,6 @@ road_name = str_replace_all( # Remove extra spaces that may result from replacements road_name = str_trim(road_name) ) %>% - # Remove duplicated columns except for year - select(-one_of(required_columns[required_columns != "year"])) %>% mutate(across(-geometry, ~ replace(., . 
%in% c(0, "0000"), NA))) %>% mutate(surface_year = ifelse(surface_year == 9999, NA, surface_year)) %>% # Group by the characteristics that we want @@ -183,15 +187,13 @@ road_name = str_replace_all( intersection_matrix <- st_intersects(data) # Create intersecting pairs - intersecting_pairs <- do.call( - rbind, - lapply(seq_along(intersection_matrix), function(i) { - data.frame( - polygon_1 = i, - polygon_2 = intersection_matrix[[i]] - ) - }) - ) %>% + intersecting_pairs <- map(seq_along(intersection_matrix), \(x) { + data.frame( + polygon_1 = x, + polygon_2 = intersection_matrix[[x]] + ) + }) %>% + bind_rows() %>% filter(polygon_1 != polygon_2) # Remove self-matches # Add polygon IDs and relevant columns for merging data_with_ids <- data %>% From a1b8691741866c53ff1a8c200f2fba4052e85a29 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 19:09:21 +0000 Subject: [PATCH 69/74] Final? --- .../spatial/spatial-environment-traffic.R | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index b39ac26f4..78aa41795 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -96,9 +96,10 @@ walk(parquet_files, \(file_key) { # but create an intersection of columns we want # and the renamed columns. required_columns <- c( - "FCNAME", "FC_NAME", "LNS", "SURF_TYP", "SURF_WTH", "SURF_YR", "AADT", - "CRS_WITH", "CRS_OPP", "CRS_YR", "ROAD_NAME", "DTRESS_WTH", "DTRESS_OPP", - "SP_LIM", "INVENTORY", "geometry_3435", "year" + "road_type", "lanes", "surface_type", "surface_width", "surface_year", + "daily_traffic", "speed_limit", "condition_with", "condition_opposing", + "condition_year", "road_name", "distress_with", "distress_opposing", + "inventory", "geometry_3435", "year" ) renames <- c( @@ -119,17 +120,15 @@ walk(parquet_files, \(file_key) { ) shapefile_data <- shapefile_data %>% - rename_with(~ renames[.x], .cols = intersect(names(.), names(renames))) + rename_with(~ str_replace_all(.x, renames)) - # Create a list of required columns based on the rename mappings - required_columns <- unique(unname(renames)) + missing_columns <- setdiff(required_columns, names(shapefile_data)) - # Identify missing renamed columns and add them with NA values - missing_columns <- setdiff(required_columns, colnames(shapefile_data)) + # Add missing columns with NA values directly shapefile_data[missing_columns] <- NA - shapefile_data <- shapefile_data %>% + select(all_of(required_columns)) %>% mutate( surface_type = road_codes[as.character(surface_type)], speed_limit = as.numeric(speed_limit), @@ -283,7 +282,8 @@ walk(parquet_files, \(file_key) { shapefile_data <- shapefile_data %>% - mutate(across(-c(geometry, geometry_3435), ~ ifelse(is.nan(.), NA, .))) + mutate(across(-c(geometry, geometry_3435), ~ ifelse(is.nan(.), NA, .))) %>% + relocate(year, .after = last_col()) output_path <- file.path(output_bucket, basename(file_key)) geoarrow::write_geoparquet(shapefile_data, output_path) From 02bbdccfd2196bcb66ea658e84675ea7357a293d Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 19:10:26 +0000 Subject: [PATCH 70/74] Remove line at end --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 78aa41795..3f6f601a3 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -290,4 +290,4 @@ walk(parquet_files, \(file_key) { print(paste(file_key, "cleaned and uploaded.")) } -}, .progress = TRUE) +} From bc72c99b9625dce265a368f858ac4c35b140a37a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 19:12:05 +0000 Subject: [PATCH 71/74] Rename environ --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 3f6f601a3..35154ab12 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -6,7 +6,7 @@ library(sf) library(stringr) # Define the S3 bucket and folder path -AWS_S3_RAW_BUCKET <- "s3://ccao-data-raw-us-east-1" +AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") s3_folder <- "spatial/environment/traffic" output_bucket <- sub("/$", "", file.path(AWS_S3_WAREHOUSE_BUCKET, s3_folder)) From 87da63bb6b3a747bd9fe42ae5ab51e06539b4bfe Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 19:18:51 +0000 Subject: [PATCH 72/74] Add ) --- .../spatial/spatial-environment-traffic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 35154ab12..b7b4894c1 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -290,4 +290,4 @@ walk(parquet_files, \(file_key) { print(paste(file_key, "cleaned and uploaded.")) } -} +}) From a5df33168fb1dfa2c66076a83d6a2e284ad8c4fa Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 19:21:01 +0000 Subject: [PATCH 73/74] lintr --- .../spatial/spatial-environment-traffic.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index b7b4894c1..76799259f 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -253,7 +253,8 @@ walk(parquet_files, \(file_key) { # Run the function # Initialize with placeholder to ensure the first iteration runs - # Initialize previous NA counts with values that differ from any real NA count + # Initialize previous NA counts with values that + # differ from any real NA count previous_na_counts <- list( daily_traffic_na = -1, speed_limit_na = -1 @@ -282,7 +283,8 @@ walk(parquet_files, \(file_key) { shapefile_data <- shapefile_data %>% - mutate(across(-c(geometry, geometry_3435), ~ ifelse(is.nan(.), NA, .))) %>% + mutate(across(-c(geometry, geometry_3435), + ~ ifelse(is.nan(.), NA, .))) %>% relocate(year, .after = 
last_col()) output_path <- file.path(output_bucket, basename(file_key)) From 5c60c23c7ea360068d446545a48146cbebe7a03a Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 4 Nov 2024 19:30:28 +0000 Subject: [PATCH 74/74] styler --- .../spatial/spatial-environment-traffic.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R index 76799259f..d98ed2c8a 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/spatial/spatial-environment-traffic.R @@ -148,7 +148,8 @@ walk(parquet_files, \(file_key) { # Replace full street name words with abbreviations road_name = str_replace_all( road_name, - c("\\bavenue\\b" = "ave", + c( + "\\bavenue\\b" = "ave", "\\bav\\b" = "ave", "\\bstreet\\b" = "st", "\\bcourt\\b" = "ct", @@ -283,8 +284,10 @@ walk(parquet_files, \(file_key) { shapefile_data <- shapefile_data %>% - mutate(across(-c(geometry, geometry_3435), - ~ ifelse(is.nan(.), NA, .))) %>% + mutate(across( + -c(geometry, geometry_3435), + ~ ifelse(is.nan(.), NA, .) + )) %>% relocate(year, .after = last_col()) output_path <- file.path(output_bucket, basename(file_key))
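One closing note on the NA-backfill loop that several of these commits rework: the controlling pattern is simply "fill, then stop once a full pass changes nothing." A minimal sketch of that shape is below; `fill_from_neighbors()` is a hypothetical stand-in for the script's neighbor-averaging step, not an actual function in this repo.

```r
# Run-until-stable pattern: repeat the fill step until the NA counts
# stop changing between passes.
fill_until_stable <- function(df, fill_from_neighbors) {
  # Seed with counts no real data can produce so the first pass runs.
  previous_na_counts <- list(daily_traffic_na = -1, speed_limit_na = -1)
  repeat {
    current_na_counts <- list(
      daily_traffic_na = sum(is.na(df$daily_traffic)),
      speed_limit_na = sum(is.na(df$speed_limit))
    )
    # Unchanged counts mean the fill reached a fixed point: stop.
    if (identical(current_na_counts, previous_na_counts)) break
    previous_na_counts <- current_na_counts
    df <- fill_from_neighbors(df)
  }
  df
}
```

Because a fill pass can only reduce the NA counts, both counts are non-increasing and bounded below by zero, so the loop always terminates (assuming the fill step never introduces new NAs).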