Skip to content

Commit

Permalink
Merge pull request #27 from alephnull7/master
Browse files Browse the repository at this point in the history
Refactor of `PACVr.complete`; decoding `IRCheck` as a single variable; handle use of `note` qualifier for IR name
  • Loading branch information
michaelgruenstaeudl authored Feb 6, 2024
2 parents 3b6caf8 + 57c4fee commit 6efad5c
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 74 deletions.
1 change: 0 additions & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ on:
paths: ["tests/**"]
pull_request:
branches: [main, master]
paths: ["tests/**"]

name: R-CMD-check

Expand Down
7 changes: 5 additions & 2 deletions R/IRoperations.R
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,10 @@ plotIRLinks <- function(linkData, syntenyLineType) {
}
}

getSyntenyLineType <- function(IRCheck) {
  # Decodes the synteny line type from `IRCheck`.
  #
  # Args:
  #   IRCheck: the user-supplied `IRCheck` setting.
  # Returns:
  #   `IRCheck` itself when it is a single value equal to 1 or 2;
  #   NULL for every other input (including NULL, NA, or longer vectors).
  syntenyLineTypes <- c(1, 2)
  # `if` needs exactly one TRUE/FALSE; guard against empty or
  # multi-element input before the membership test.
  isSingleValue <- length(IRCheck) == 1L
  if (isSingleValue && IRCheck %in% syntenyLineTypes) {
    return(IRCheck)
  }
  return(NULL)
}
107 changes: 55 additions & 52 deletions R/PACVr.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ PACVr.parseSource <- function(gbkDataDF) {

PACVr.parseGenes <- function (gbkDataDF) {
  # Parses the genes of a GenBank file: logs an info message, then hands
  # `gbkDataDF` to `ExtractAllGenes` and returns whatever that call yields.
  logger::log_info('Parsing the different genes')
  parsedGenes <- ExtractAllGenes(gbkDataDF)
  parsedGenes
}

PACVr.calcCoverage <- function (bamFile, windowSize=250) {
  # Logs an info message, then forwards `bamFile` and `windowSize`
  # (default 250) to `CovCalc` and returns its result.
  logger::log_info('Calculating the sequencing coverage')
  coverageResult <- CovCalc(bamFile, windowSize)
  coverageResult
}
Expand All @@ -53,30 +55,19 @@ PACVr.verboseInformation <- function(gbkData,
bamFile,
genes,
quadripRegions,
IRCheck,
analysisSpecs,
output) {
sampleName <- read.gbSampleName(gbkData)
# Step 1. Check ...
if (!is.na(output)) {
outDir <- dirname(output)
tmpDir <- file.path(outDir,
paste(sampleName["sample_name"],
".tmp",
sep=""))
if (analysisSpecs$isIRCheck) {
logger::log_info('Generating statistical information on the sequencing coverage')
verboseInformation(gbkData,
bamFile,
genes,
quadripRegions,
analysisSpecs,
output)
} else {
tmpDir <-
file.path(".", paste(sampleName["sample_name"],
".tmp",
sep=""))
}
# Step 2. Check ...
if (dir.exists(tmpDir) == FALSE) {
dir.create(tmpDir)
}
# Step 3. Write output
writeTables(quadripRegions, bamFile, genes, tmpDir, sampleName)
if (IRCheck) {
checkIREquality(gbkData, quadripRegions, tmpDir, sampleName)
logger::log_warn(paste0('Verbose output requires `IRCheck` in ',
'`', deparse(getIRCheckTypes()), '`'))
}
}

Expand Down Expand Up @@ -109,6 +100,32 @@ PACVr.visualizeWithRCircos <- function(gbkData,
)
}

PACVr.quadripRegions <- function(gbkData,
                                 gbkDataDF,
                                 isIRCheck) {
  # Selects how the genome regions are obtained.
  #
  # When `isIRCheck` is TRUE, an info message is logged and the regions come
  # from `PACVr.parseQuadripRegions(gbkData, gbkDataDF)`; otherwise they come
  # from `PACVr.parseSource(gbkDataDF)` and nothing is logged.
  regions <- if (isIRCheck) {
    logger::log_info('Parsing the different genome regions')
    PACVr.parseQuadripRegions(gbkData, gbkDataDF)
  } else {
    PACVr.parseSource(gbkDataDF)
  }
  regions
}

PACVr.linkData <- function(genes,
                           quadripRegions,
                           syntenyLineType) {
  # Builds the IR link data, or returns NULL when no synteny line type
  # was requested (`syntenyLineType` is NULL).
  if (is.null(syntenyLineType)) {
    return(NULL)
  }
  logger::log_info('Inferring the IR regions and the genes within the IRs')
  irLinks <- PACVr.generateIRGeneData(genes, quadripRegions, syntenyLineType)
  irLinks
}

#' @title Execute the complete pipeline of \pkg{PACVr}
#' @description This function executes the complete pipeline of \pkg{PACVr}
#' via a single command.
Expand Down Expand Up @@ -173,53 +190,39 @@ PACVr.complete <- function(gbkFile,
output=NA) {
######################################################################
gbkData <- PACVr.read.gb(gbkFile)
isIRCheck <- getIsIRCheck(IRCheck)
gbkDataDF <- read.gb2DF(gbkData, isIRCheck)
analysisSpecs <- getAnalysisSpecs(IRCheck)
gbkDataDF <- read.gb2DF(gbkData,
analysisSpecs)
if (is.null(gbkDataDF)) {
logger::log_error(paste("No usable data to perform specified analysis"))
return(NULL)
}

###################################
if (isIRCheck) {
logger::log_info('Parsing the different genome regions')
quadripRegions <- PACVr.parseQuadripRegions(gbkData,
gbkDataDF)
} else {
quadripRegions <- PACVr.parseQuadripRegions(gbkDataDF)
}
quadripRegions <- PACVr.quadripRegions(gbkData,
gbkDataDF,
analysisSpecs$isIRCheck)

###################################
logger::log_info('Parsing the different genes')
genes <- PACVr.parseGenes(gbkDataDF)

###################################
logger::log_info('Calculating the sequencing coverage')
coverage <- PACVr.calcCoverage(bamFile,
windowSize)

###################################
linkData <- NULL
IRCheck <- isSyntenyLineType(IRCheck)
if (IRCheck) {
logger::log_info('Inferring the IR regions and the genes within the IRs')
linkData <- PACVr.generateIRGeneData(genes,
quadripRegions,
IRCheck)
}
linkData <- PACVr.linkData(genes,
quadripRegions,
analysisSpecs$syntenyLineType)

###################################
if (isIRCheck && verbose) {
logger::log_info('Generating statistical information on the sequencing coverage')
PACVr.verboseInformation(gbkData,
bamFile,
genes,
quadripRegions,
IRCheck,
output)
} else if (verbose) {
logger::log_warn(paste0('Verbose output requires `IRCheck` in ',
'`', deparse(getIRCheckTypes()), '`'))
if (verbose) {
PACVr.verboseInformation(gbkData,
bamFile,
genes,
quadripRegions,
analysisSpecs,
output)
}

###################################
Expand Down
89 changes: 72 additions & 17 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.02.01.1736"

read.gb2DF <- function(gbkData, IRPresenceAndSyntenyCheck) {
read.gb2DF <- function(gbkData, analysisSpecs) {
fileDF <- data.frame()
for (sample in gbkData) {
sampleDF <- parseFeatures(sample$FEATURES, IRPresenceAndSyntenyCheck)
sampleDF <- parseFeatures(sample$FEATURES, analysisSpecs)
if (!is.null(sampleDF)) {
fileDF <- dplyr::bind_rows(fileDF, sampleDF)
}
Expand All @@ -18,7 +18,7 @@ read.gb2DF <- function(gbkData, IRPresenceAndSyntenyCheck) {
return(fileDF)
}

parseFeatures <- function(features, IRPresenceAndSyntenyCheck) {
parseFeatures <- function(features, analysisSpecs) {
sampleDF <- data.frame()
for (feature in features) {
feature <- parseFeature(feature)
Expand All @@ -27,7 +27,7 @@ parseFeatures <- function(features, IRPresenceAndSyntenyCheck) {
}
}
# check if can we can use the sample
subsetCols <- checkFeatureQualifiers(sampleDF, IRPresenceAndSyntenyCheck)
subsetCols <- checkFeatureQualifiers(sampleDF, analysisSpecs)
if (is.null(subsetCols)) {
return(NULL)
}
Expand Down Expand Up @@ -187,12 +187,9 @@ read.gbGenes <- function(gbkDataDF) {

read.gbOther <- function(gbkDataDF) {
type <- NULL
subsetCols <- c("seqnames", "start", "end",
"gene", "note", "standard_name")
regions <- gbkDataDF %>%
dplyr::filter(!type %in% c("gene", "exon", "transcript",
"CDS", "variant")) %>%
dplyr::select(dplyr::all_of(subsetCols))
"CDS", "variant"))
rownames(regions) <- NULL
return(regions)
}
Expand Down Expand Up @@ -372,26 +369,84 @@ validateColors <- function(colorsToValidate) {
}
}

checkFeatureQualifiers <- function(sampleDF, analysisSpecs) {
  # Checks that `sampleDF` carries the feature qualifiers needed for the
  # analysis described by `analysisSpecs`.
  #
  # Returns:
  #   NULL (after logging a warning that names the missing qualifiers) when
  #   `getSubsetData` reports missing columns; otherwise the qualifier
  #   columns to keep, extended by "start", "end" and "seqnames".
  subsetData <- getSubsetData(sampleDF, analysisSpecs)
  # Bind the missing columns locally: the warning below must read them from
  # `subsetData`, not from a variable that does not exist in this scope.
  missingCols <- subsetData$missingCols
  if (length(missingCols) > 0) {
    logger::log_warn(paste0("Unable to analyze sample as specified; ",
                            "missing feature qualifiers: ",
                            "'",
                            paste(missingCols, collapse = "', '"),
                            "'"))
    return(NULL)
  }
  # add future generated columns that are needed
  subsetCols <- c(subsetData$subsetCols, "start", "end", "seqnames")
  return(subsetCols)
}

getSubsetCols <- function(analysisSpecs) {
  # Lists the feature-qualifier columns for the requested analysis:
  # always "gene", "note" and "type", followed by "standard_name" when
  # `analysisSpecs$isIRCheck` is TRUE.
  irOnlyCols <- if (analysisSpecs$isIRCheck) "standard_name" else character(0)
  c("gene", "note", "type", irOnlyCols)
}

getSubsetData <- function(sampleDF, analysisSpecs) {
  # Works out which qualifier columns are wanted and which of them are
  # absent from `sampleDF`.
  #
  # Returns a list with:
  #   subsetCols  - the wanted columns (from `getSubsetCols`)
  #   missingCols - the wanted columns that are not column names of `sampleDF`
  # When `analysisSpecs$isIRCheck` is TRUE and "standard_name" is absent,
  # an info message is logged and "standard_name" is removed from both
  # vectors, so its absence is not reported as missing.
  wantedCols <- getSubsetCols(analysisSpecs)
  absentCols <- wantedCols[is.na(match(wantedCols, colnames(sampleDF)))]

  useNoteForIRName <- analysisSpecs$isIRCheck &&
    ("standard_name" %in% absentCols)
  if (useNoteForIRName) {
    logger::log_info("Using `note` for IR name qualifier")
    withoutStandardName <- function(cols) cols[cols != "standard_name"]
    wantedCols <- withoutStandardName(wantedCols)
    absentCols <- withoutStandardName(absentCols)
  }

  list(subsetCols = wantedCols, missingCols = absentCols)
}

isIgnoredFeature <- function(featureName) {
  # Element-wise test of whether `featureName` is on the ignore list,
  # which currently holds only "D-loop".
  ignoreList <- c("D-loop")
  match(featureName, ignoreList, nomatch = 0L) > 0L
}

getAnalysisSpecs <- function(IRCheck) {
  # Decodes the single `IRCheck` setting into a two-element list:
  #   syntenyLineType - result of `getSyntenyLineType(IRCheck)`
  #   isIRCheck       - result of `getIsIRCheck(IRCheck)`
  lineType <- getSyntenyLineType(IRCheck)
  irCheckFlag <- getIsIRCheck(IRCheck)
  list(syntenyLineType = lineType, isIRCheck = irCheckFlag)
}

verboseInformation <- function(gbkData,
                               bamFile,
                               genes,
                               quadripRegions,
                               analysisSpecs,
                               output) {
  # Writes the verbose output tables into a "<sample_name>.tmp" directory.
  #
  # The directory sits next to `output` when `output` is not NA, and in the
  # current directory otherwise; it is created if it does not exist yet.
  # `checkIREquality` is run afterwards only when
  # `analysisSpecs$syntenyLineType` is not NULL.
  sampleName <- read.gbSampleName(gbkData)

  # Step 1. Decide where the temporary directory lives
  parentDir <- if (!is.na(output)) dirname(output) else "."
  tmpDir <- file.path(parentDir, paste0(sampleName["sample_name"], ".tmp"))

  # Step 2. Make sure the directory exists
  if (!dir.exists(tmpDir)) {
    dir.create(tmpDir)
  }

  # Step 3. Write output
  writeTables(quadripRegions, bamFile, genes, tmpDir, sampleName)
  hasSyntenyLineType <- !is.null(analysisSpecs$syntenyLineType)
  if (hasSyntenyLineType) {
    checkIREquality(gbkData, quadripRegions, tmpDir, sampleName)
  }
}
4 changes: 2 additions & 2 deletions inst/extdata/PACVr_Rscript.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.02.02.2100"
#version="2024.02.05.2100"

library("optparse")

Expand Down Expand Up @@ -75,7 +75,7 @@ CmdLineArgs <- function() {
dest = "verbose",
help = paste("a boolean, that when TRUE, generates additional files with",
"detailed genomic region information;",
"requires a `regionsCheck` value that will perform region analysis"),
"requires a `IRCheck` value that will perform region analysis"),
metavar = "logical"),
make_option(opt_str = c("-o","--output"),
type = "character",
Expand Down

0 comments on commit 6efad5c

Please sign in to comment.