Skip to content

Commit

Permalink
i #284 Updated Notebook, exec/mailinglist.R and R/mail.R
Browse files Browse the repository at this point in the history
- Update pkgdown.yml
- Set eval to False for notebook
- Added warning for failed downloads
- Added check for missing months in the date range within save_folder_path
- Changed mbox_path in parsers to mbox_file_path
- Use gt package to view tables
- Made changes so Knit works for download_mail.Rmd
- Updated exec/mailinglist.R to use new functions
- To do: Use getter functions once they are merged

Signed-off-by: Dao McGill <dmcgill@hawaii.edu>
  • Loading branch information
daomcgill committed Oct 6, 2024
1 parent aa60648 commit 64e0646
Show file tree
Hide file tree
Showing 35 changed files with 306 additions and 170 deletions.
110 changes: 104 additions & 6 deletions R/mail.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#'
#' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox.
#' The function only downloads files that fall between the specified start_year_month and end_year_month.
#' When both formats fail to download, the function issues a warning indicating the missing month.
#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months.
#'
#' @param mailing_list The name of the mailing list being downloaded e.g. "https://mta.openssl.org/pipermail/openssl-announce/"
#' @param start_year_month The year and month of the first file to be downloaded format: 'YYYYMM'
Expand Down Expand Up @@ -84,6 +86,9 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
}
}

########## Initialize Vector for Failed Months ##########
failed_months <- character()

########## Use Links to Download Individual Files ##########
# Initialize a vector for storing the paths of the downloaded files.
downloaded_files <- c()
Expand Down Expand Up @@ -117,6 +122,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
response <- httr::GET(download_url, httr::timeout(60))
if (httr::status_code(response) != 200) {
warning("Both .txt and .gz downloads failed for link: ", link, "\n")
failed_months <- c(failed_months, year_month_clean)
next
}
}
Expand Down Expand Up @@ -159,6 +165,48 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
downloaded_files <- c(downloaded_files, dest)
}

########## Summary of Downloads ##########
if (length(failed_months) > 0) {
warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", "))
}
# List the files in the save_folder_path.
downloaded_files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE)

# Extract the YYYYMM from the file names.
downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files_in_folder))

# Create the expected list of YYYYMM between start_year_month and end_year_month.
start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d")
end_date <- as.Date(paste0(end_year_month, "01"), format = "%Y%m%d")
all_dates <- seq(start_date, end_date, by = "month")
expected_dates <- as.numeric(format(all_dates, "%Y%m"))

# Identify missing months.
missing_months <- setdiff(expected_dates, downloaded_dates)

# Determine the earliest and latest dates downloaded.
if (length(downloaded_dates) > 0) {
min_downloaded_date <- min(downloaded_dates)
max_downloaded_date <- max(downloaded_dates)

if (verbose) {
cat("\nSummary of Downloads:\n")
cat("save_folder_path contains mail from date ", min_downloaded_date, " to ", max_downloaded_date, "\n")
}
} else {
if (verbose) {
cat("No files found in save_folder_path\n")
}
}

if (length(missing_months) == 0) {
if (verbose) {
cat("No missing months\n")
}
} else {
warning("Months missing in the date range: ", paste(missing_months, collapse = ", "), "\n")
}

########## Return List of Downloaded Files ##########
# Return the list of downloaded .mbox files
return(downloaded_files)
Expand Down Expand Up @@ -307,7 +355,9 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) {
#' of kaiaulu_YYYYMM.mbox.
#'
#' The function loops through each month in the range specified by `start_year_month` and `end_year_month`,
#' and constructs the appropriate URL to download each month's data. If any download fails, an error message is printed.
#' and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months.
#' This means the file could not be found and that month's data may not exist.
#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months.
#'
#' @param mailing_list The URL of the Apache Pony Mail list from which mbox files are to be downloaded
#' (e.g., "https://lists.apache.org/list.html?announce@apache.org").
Expand Down Expand Up @@ -336,6 +386,10 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa
end_year <- as.numeric(substr(end_year_month, 1, 4))
end_month <- as.numeric(substr(end_year_month, 5, 6))

########## Initialize Vectors for Failed Months ##########
# Vectors to track failed downloads.
failed_months <- character()

########## Download Loop ##########
# Iterate over the years and months from start_year/month to end_year/month.
# This is done by looping over the years, and for each year, looping over the 12 months.
Expand Down Expand Up @@ -380,10 +434,54 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa
}
# Remove failed download file.
unlink(file_path)
failed_months <- c(failed_months, year_month_str)
}
}
}

########## Summary of Failed Downloads ##########
if (length(failed_months) > 0) {
warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", "))
}

# List the files in the save_folder_path
downloaded_files <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE)

# Extract the YYYYMM from the file names
downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files))

# Find the expected list of YYYYMM between start_year_month and end_year_month
start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d")
end_date <- as.Date(paste0(end_year_month, "01"), format = "%Y%m%d")
all_dates <- seq(start_date, end_date, by = "month")
expected_dates <- as.numeric(format(all_dates, "%Y%m"))

# Identify missing months
missing_months <- setdiff(expected_dates, downloaded_dates)

# Determine the earliest and latest dates downloaded
if (length(downloaded_dates) > 0) {
min_downloaded_date <- min(downloaded_dates)
max_downloaded_date <- max(downloaded_dates)

if (verbose) {
cat("\nSummary of Downloads:\n")
cat("save_folder_path contains mail from date", min_downloaded_date, "to", max_downloaded_date, "\n")
}
} else {
if (verbose) {
cat("No files found in save_folder_path\n")
}
}

if (length(missing_months) == 0) {
if (verbose) {
cat("No missing months\n")
}
} else {
warning("Months missing in the date range:", paste(missing_months, collapse = ", "), "\n")
}

########## Return Save Path ##########
# Return the folder path where all mbox files were saved.
return(save_folder_path)
Expand Down Expand Up @@ -465,19 +563,19 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v
#' consistently renamed for clarity.
#'
#' @param perceval_path path to perceval binary
#' @param mbox_path path to mbox archive file (ends in .mbox)
#' @param mbox_file_path path to mbox archive file (ends in .mbox)
#' @export
#' @family parsers
parse_mbox <- function(perceval_path, mbox_path){
parse_mbox <- function(perceval_path, mbox_file_path){
# Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop")
perceval_path <- path.expand(perceval_path)
mbox_path <- path.expand(mbox_path)
mbox_file_path <- path.expand(mbox_file_path)
# Remove ".mbox"
mbox_uri <- stringi::stri_replace_last_regex(mbox_path, pattern = "\\.mbox$", replacement = "")
mbox_uri <- stringi::stri_replace_last_regex(mbox_file_path, pattern = "\\.mbox$", replacement = "")

# Use percerval to parse mbox. --json line is required to be parsed by jsonlite::fromJSON.
perceval_output <- system2(perceval_path,
args = c('mbox',mbox_uri,mbox_path,'--json-line'),
args = c('mbox',mbox_uri,mbox_file_path,'--json-line'),
stdout = TRUE,
stderr = FALSE)

Expand Down
8 changes: 5 additions & 3 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,17 @@ reference:
- title: __Mail__
desc: >
Download, parsing and data generation of mailing lists.
See the [Download Mbox](../articles/download_mod_mbox.html)
See the [Download Mail](../articles/download_mail.html)
and [Reply](../articles/reply_communication_showcase.html)
Notebooks for details.
- contents:
- download_pipermail
- convert_pipermail_to_mbox
- refresh_pipermail
- download_mod_mbox
- download_mod_mbox_per_month
- refresh_mod_mbox
- process_gz_to_mbox_in_folder
- parse_mbox
- parse_mbox_latest_date
- make_mbox_reply
- make_mbox_mailing_list
- title: __JIRA__
Expand Down
8 changes: 4 additions & 4 deletions conf/helix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,29 +55,29 @@ mailing_list:
end_year_month: 202405
save_folder_path: "../../extdata/save_mbox_mail"
# mbox_path is for use only with parse_mbox() function. It is the file to parse.
mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202410.mbox"
mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202410.mbox"
project_key_2:
mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org
start_year_month: 202201
end_year_month: 202401
save_folder_path: "../../extdata/save_mbox_mail"
# mbox_path is for use only with parse_mbox() function. It is the file to parse.
mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox"
mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox"
pipermail:
project_key_1:
mailing_list: https://mta.openssl.org/pipermail/openssl-users/
start_year_month: 202310
end_year_month: 202405
save_folder_path: "../../extdata/save_folder_mail"
# mbox_path is for use only with parse_mbox() function. It is the file to parse.
mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202310.mbox"
mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202310.mbox"
project_key_2:
mailing_list: https://mta.openssl.org/pipermail/openssl-project/
start_year_month: 202203
end_year_month: 202303
save_folder_path: "../../extdata/save_folder_mail_2"
# mbox_path is for use only with parse_mbox() function. It is the file to parse.
mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox"
mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox"

issue_tracker:
jira:
Expand Down
116 changes: 59 additions & 57 deletions exec/mailinglist.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,42 +6,38 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

require(yaml,quietly=TRUE)
require(cli,quietly=TRUE)
require(docopt,quietly=TRUE)
require(kaiaulu,quietly=TRUE)
require(data.table,quietly=TRUE)


require(yaml, quietly = TRUE)
require(cli, quietly = TRUE)
require(docopt, quietly = TRUE)
require(kaiaulu, quietly = TRUE)
require(data.table, quietly = TRUE)

doc <- "
USAGE:
mailinglist.R tabulate help
mailinglist.R tabulate <tools.yml> <project_conf.yml> <save_file_name_path>
mailinglist.R download modmbox help
mailinglist.R download modmbox <project_conf.yml> <start_year> <end_year> <save_file_name_path>
mailinglist.R download modmboxmonth help
mailinglist.R download modmboxmonth <project_conf.yml> <start_year> <end_year> <save_folder_name_path>
mailinglist.R download modmbox <project_conf.yml> <start_year_month> <end_year_month> <save_folder_path>
mailinglist.R download pipermail help
mailinglist.R download pipermail <project_conf.yml> <start_year_month> <end_year_month> <save_folder_path>
mailinglist.R (-h | --help)
mailinglist.R --version
DESCRIPTION:
Provides a suite of functions to interact with Mailing Lists. Please see
Kaiaulu's README.md for instructions on how to create <tool.yml>
Kaiaulu's README.md for instructions on how to create <tools.yml>
and <project_conf.yml>.
OPTIONS:
-h --help Show this screen.
--version Show version.
"



arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9600')
if(arguments[["tabulate"]] & arguments[["help"]]){
cli_alert_info("Tabulates a mailing list using parse_mbox().")
}else if(arguments[["tabulate"]]){

if (arguments[["tabulate"]] & arguments[["help"]]) {
cli::cli_alert_info("Tabulates a mailing list using parse_mbox().")
} else if (arguments[["tabulate"]]) {

tools_path <- arguments[["<tools.yml>"]]
conf_path <- arguments[["<project_conf.yml>"]]
Expand All @@ -51,56 +47,62 @@ if(arguments[["tabulate"]] & arguments[["help"]]){
conf <- yaml::read_yaml(conf_path)

perceval_path <- path.expand(tool[["perceval"]])
mbox_path <- path.expand(conf[["mailing_list"]][["mbox"]])

project_mbox <- parse_mbox(perceval_path,mbox_path)
mbox_file_path <- path.expand(conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_file_path"]])

cli_alert_success(paste0("Tabulated mailing list was saved at: ",save_path))
project_mbox <- parse_mbox(perceval_path, mbox_file_path)

data.table::fwrite(project_mbox,save_path)
}else if(arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]){
cli_alert_info("Saves a mailing list archive from mod_mbox as a .mbox file
using download_mod_mbox().")
}else if(arguments[["download"]] & arguments[["modmbox"]]){
data.table::fwrite(project_mbox, save_path)
cli::cli_alert_success(paste0("Tabulated mailing list was saved at: ", save_path))

} else if (arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]) {
cli::cli_alert_info("Downloads mailing list archives from mod_mbox using download_mod_mbox().")
} else if (arguments[["download"]] & arguments[["modmbox"]]) {

conf_path <- arguments[["<project_conf.yml>"]]
save_path <- arguments[["<save_file_name_path>"]]
conf <- yaml::read_yaml(conf_path)
start_year_month <- arguments[["<start_year_month>"]]
end_year_month <- arguments[["<end_year_month>"]]
save_folder_path <- arguments[["<save_folder_path>"]]

mod_mbox_url <- conf[["mailing_list"]][["domain"]]
mailing_list <- conf[["mailing_list"]][["list_key"]][1]

start_year <- arguments[["<start_year>"]]
end_year <- arguments[["<end_year>"]]
conf <- yaml::read_yaml(conf_path)
mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]]

mbox <- download_mod_mbox(base_url = mod_mbox_url,
mailing_list = mailing_list,
from_year=start_year,
to_year=end_year,
save_file_path = save_path,
verbose = TRUE)
download_mod_mbox(
mailing_list = mailing_list,
start_year_month = start_year_month,
end_year_month = end_year_month,
save_folder_path = save_folder_path,
verbose = TRUE
)

cli_alert_success(paste0("Downloaded mailing list was saved at: ",save_path))
}else if(arguments[["download"]] & arguments[["modmboxmonth"]]){
cli::cli_alert_success(paste0("Downloaded mailing list archives were saved at: ", save_folder_path))

} else if (arguments[["download"]] & arguments[["pipermail"]] & arguments[["help"]]) {
cli::cli_alert_info("Downloads mailing list archives from pipermail using download_pipermail().")
} else if (arguments[["download"]] & arguments[["pipermail"]]) {

conf_path <- arguments[["<project_conf.yml>"]]
save_path <- arguments[["<save_folder_name_path>"]]
conf <- yaml::read_yaml(conf_path)

mod_mbox_url <- conf[["mailing_list"]][["domain"]]
mailing_list <- conf[["mailing_list"]][["list_key"]][1]
start_year_month <- arguments[["<start_year_month>"]]
end_year_month <- arguments[["<end_year_month>"]]
save_folder_path <- arguments[["<save_folder_path>"]]

start_year <- arguments[["<start_year>"]]
end_year <- arguments[["<end_year>"]]

mbox <- download_mod_mbox_per_month(base_url = mod_mbox_url,
mailing_list = mailing_list,
from_year=start_year,
to_year=end_year,
save_folder_path = save_path,
verbose = TRUE)

cli_alert_success(paste0("Downloaded mailing list was saved at: ",save_path))
conf <- yaml::read_yaml(conf_path)
mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]]

download_pipermail(
mailing_list = mailing_list,
start_year_month = start_year_month,
end_year_month = end_year_month,
save_folder_path = save_folder_path,
verbose = TRUE
)

cli::cli_alert_success(paste0("Downloaded mailing list archives were saved at: ", save_folder_path))

} else if (arguments[["-h"]] || arguments[["--help"]]) {
cli::cli_alert_info(doc)
} else if (arguments[["--version"]]) {
cli::cli_alert_info('Kaiaulu 0.0.0.9600')
} else {
cli::cli_alert_danger("Invalid command or arguments. Use --help for usage information.")
}

Loading

0 comments on commit 64e0646

Please sign in to comment.