Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimal Hard Thresholding to determine encoding dimension #68

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7c5f7bc
changed estimateBestQ to use OHT
andrearaithel Jun 12, 2024
c360ac0
edit estimateBestQ error messages and variable names
andrearaithel Jun 13, 2024
be10abd
add unit tests for OHT
andrearaithel Jun 13, 2024
1fb6975
update OHT tests
andrearaithel Jun 13, 2024
f2371ca
add denoiseR package to Suggests
andrearaithel Jun 16, 2024
de8d41b
add xcolor package for vignette building
andrearaithel Jun 17, 2024
e12ccaf
added my name
andrearaithel Jun 20, 2024
caa2b8e
update vignette
andrearaithel Jun 20, 2024
a2032ba
add if statement to transpose zScores if aspect ratio larger than 1
andrearaithel Jul 8, 2024
ee5ee2e
replace small latent space error by warning
andrearaithel Jul 16, 2024
2643bec
fix estimateBestQ test error
andrearaithel Jul 16, 2024
c788b5e
fix: use tinytex version 0.51 to avoid breaking changes from the newe…
andrearaithel Jul 21, 2024
ba3340e
try tinytex version 0.50
andrearaithel Jul 21, 2024
7e19fb5
try to fix gha error by reverting changes to vignette
andrearaithel Jul 21, 2024
a0c126a
add xcolor package and define fgcolor again to fix error
andrearaithel Jul 21, 2024
27437f0
add highr package
andrearaithel Jul 21, 2024
44caef1
specify knitr version to fix gha error
andrearaithel Jul 21, 2024
7e01545
revert changes to DESCRIPTION
andrearaithel Jul 21, 2024
b3fc312
revert changes to gha workflow and install specific knitr version
andrearaithel Jul 21, 2024
bf875f7
try to fix error by installing highlight package and calling it in th…
andrearaithel Jul 22, 2024
f0fd1c0
try highr package instead of highlight package
andrearaithel Jul 22, 2024
df6b4f6
try to update knitr and tinytex
andrearaithel Jul 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ jobs:
- name: Install Windows system dependencies
if: runner.os == 'Windows'
run: |
install.packages("knitr")
install.packages("highr")
## Edit below if you have any Windows system dependencies
shell: Rscript {0}

Expand All @@ -158,6 +160,7 @@ jobs:
run: |
system("tlmgr --version")
install.packages("tinytex")
tinytex::tlmgr_update()
tinytex::tlmgr_install(pkgs = c("bera", "caption", "changepage", "enumitem", "everysel", "fancyhdr", "footmisc", "grfext", "index", "marginfix", "mathtools",
"ms", "nowidow", "parnotes", "parskip", "placeins", "preprint", "ragged2e", "side", "soul", "titlesec", "tocbibind", "xstring"))
shell: Rscript {0}
Expand Down
10 changes: 7 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Authors@R: c(
email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")),
person("Agne", "Matuseviciute", role=c("aut")),
person(c("Michaela", "Fee"), "Müller", role=c("ctb")),
person("Andrea", "Raithel", role=c("ctb")),
person("Vicente", "Yepez", role=c("aut"), email="yepez@in.tum.de"),
person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
Description: Identification of aberrant gene expression in RNA-seq data.
Expand All @@ -27,7 +28,7 @@ biocViews: ImmunoOncology, RNASeq, Transcriptomics, Alignment, Sequencing,
License: MIT + file LICENSE
NeedsCompilation: yes
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Depends:
R (>= 3.6),
BiocParallel,
Expand Down Expand Up @@ -59,7 +60,9 @@ Imports:
scales,
splines,
stats,
utils
utils,
RMTstat,
pracma
Suggests:
testthat,
knitr,
Expand All @@ -73,7 +76,8 @@ Suggests:
covr,
GenomeInfoDb,
ggbio,
biovizBase
biovizBase,
denoiseR
LinkingTo:
Rcpp,
RcppArmadillo
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@ exportMethods(plotManhattan)
exportMethods(plotQQ)
exportMethods(plotVolcano)
exportMethods(results)
import(RMTstat)
import(data.table, except=melt)

import(methods)
import(pracma)
importFrom(BBmisc,chunk)
importFrom(BBmisc,isFALSE)
importFrom(BBmisc,isScalarCharacter)
Expand Down
130 changes: 130 additions & 0 deletions R/controlForConfounders.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,133 @@ computeLatentSpace <- function(ods){

return(H(ods))
}

#'
#' Estimation of Q
#'
#' Estimating the best q for the given data set by Optimal Hard Thresholding
#'
#' @param ods An OutriderDataSet object
#' @param zScores A z-score matrix
#' @import RMTstat
#' @import pracma
#' @return The estimated dimension of hidden confounders
#'
#' @examples
#' ods <- makeExampleOutriderDataSet()
#'
#' estimateBestQ(ods)
#'
#' @export
estimateBestQ <- function(ods=NULL, zScores=NULL){
if (is.null(ods) & is.null(zScores)){
stop("Please provide an OutriderDataSet or a z-score matrix.")
}
else if (!is.null(ods)){
OUTRIDER:::checkOutriderDataSet(ods)
if (!is.null(zScores)){
warning("Provided z-scores are ignored and recalculated from ods.")
}

# Control for sequencing depth
if (is.null(sizeFactors(ods))){
ods <- estimateSizeFactors(ods)
}
controlledCounts <- t(t(counts(ods, normalized=FALSE)) / sizeFactors(ods))

# Log-transform controlled counts
logControlCounts <- log2((controlledCounts +1) / (rowMeans2(controlledCounts) +1))

# Compute Z-scores and control for large values
zScores <- (logControlCounts - rowMeans2(logControlCounts)) / rowSds(logControlCounts)
}
else if (!is.matrix(zScores)){
stop("Provided zScores are not a matrix.")
}
if (any(is.infinite(zScores))) {
# Check for infinite values (should be impossible!)
stop("Z-score matrix contains infinite values.")
}

# Transpose zScores if aspect ratio larger than 1
if (ncol(zScores)/nrow(zScores) > 1){
zScores <- t(zScores)
}

# Perform Singular Value Decomposition (SVD) on the matrix of Z-scores
# and extract singular values
sv <- svd(zScores)$d

# Aspect ratio of the (transposed) count matrix
numRows <- nrow(zScores)
numCols <- ncol(zScores)
beta <- numCols / numRows

# Compute the optimal w(beta)
coef <- (optimalSVHTCoef(beta) / sqrt(medianMarchenkoPastur(numCols, numRows)))

# compute cutoff
cutoff <- coef * median(sv)

# compute and return rank
if (any(sv > cutoff)){
latentDim <- max(which(sv > cutoff))
} else {
warning(paste("Optimal latent space dimension is smaller than 2. Check your count matrix and",
"verify that all samples have the expected number of counts",
"(hist(colSums(counts(ods)))).",
"For now, the latent space dimension is set to 2.", collapse = "\n"))
latentDim <- 2}
cat("Optimal encoding dimension:", latentDim, "\n")
return(latentDim)
}

#'
#' Calculate the OHT coefficient
#'
#' @noRd
optimalSVHTCoef <- function(beta){
# Calculate lambda(beta)
sqrt(2 * (beta + 1) + (8 * beta) / (beta + 1 + sqrt(beta^2 + 14 * beta + 1)))
}

#'
#' Calculate the median of the Marchenko-Pastur distribution
#'
#' Formulas are derived from a robust estimator for the noise
#' parameter gamma in the model Z~ = Z + gamma * E.
#' Gamma(Z~) = sigma / sqrt(n * mu)) with mu: median of the Marcenko-Pastur distribution.
#' More detailed explanations can be found in Gavish and Donoho 2014.
#'
#' @noRd
medianMarchenkoPastur <- function(ncol, nrow){
# Compute median of Marchenko-Pastur distribution
beta <- ncol / nrow
betaMinus <- (1 - sqrt(beta))^2
betaPlus <- (1 + sqrt(beta))^2

# Initialize range for upper integral boundary
lobnd <- copy(betaMinus)
hibnd <- copy(betaPlus)


while ((hibnd - lobnd) > 0.001){ # Iterate until convergence
x <- seq(lobnd, hibnd, length.out = 10) # Set range of values for upper integral boundary
y <- rep(0, length(x))
for (numCols in 1:length(x)){
# Approximate integral using Gauss-Kronrod Quadrature
y[numCols] <- quadgk(dmp, a=betaMinus, b=x[numCols], ndf=nrow, pdim=ncol)
}

# Set new boundaries for x that yield the closest results to 0.5
if (any(y < 0.5)){
lobnd = max(x[y < 0.5])
}

if (any(y > 0.5)){
hibnd = min(x[y > 0.5])
}
}
# If hibnd and lobnd are similar enough, return their mean
return((hibnd + lobnd) / 2.0)
}
18 changes: 0 additions & 18 deletions R/helperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -242,21 +242,3 @@ getBestQDT <- function(dt, usedEvalMethod='aucPR', digits=10){
dt[,encodingDimension[
seq_len(.N) == testFun(round(evaluationLoss, digits))]]
}

#'
#' Estimation of Q
#'
#' Estimating the best q for the given data set
#'
#' @param ods An OutriderDataSet object
#' @return The estimated dimension of hidden confounders
#'
#' @examples
#' ods <- makeExampleOutriderDataSet()
#'
#' estimateBestQ(ods)
#'
#' @export
estimateBestQ <- function(ods){
round(max(2, min(500, 3.7 + 0.16*ncol(ods))))
}
8 changes: 5 additions & 3 deletions man/estimateBestQ.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/normalizationFactors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/sizeFactors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

92 changes: 92 additions & 0 deletions tests/testthat/test_estimateBestQ.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
context("Testing the estimateBestQ function (Optimal Hard Thresholding)")

library(denoiseR)

test_that("Input validation handles NULL and non-matrix inputs", {
expect_error(estimateBestQ(),
"Please provide an OutriderDataSet or a z-score matrix.")
expect_error(estimateBestQ(NULL, NULL),
"Please provide an OutriderDataSet or a z-score matrix.")
expect_error(estimateBestQ(zScores = "not a matrix"),
"Provided zScores are not a matrix.")
expect_error(estimateBestQ(ods = "not an ods"),
"Please provide an OutriderDataSet.")

ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
package='OUTRIDER')
ctsTable <- read.table(ctsFile, check.names=FALSE)
ods <- OutriderDataSet(countData=ctsTable)
# filter out non expressed genes
ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
ods <- estimateSizeFactors(ods)

expect_warning(estimateBestQ(ods = ods, zScores = matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)),
"Provided z-scores are ignored and recalculated from ods.")
})

test_that("User is notified about invalid matrix values", {
expect_error(estimateBestQ(zScores = matrix(c(1, 2, 3, 4, 5, Inf), nrow = 3, ncol = 2)),
"Z-score matrix contains infinite values.")
})

test_that("optimalSVHTCoef works correctly", {
expect_equal(optimalSVHTCoef(0.5),
1.98, tolerance = 0.01)
expect_equal(optimalSVHTCoef(0.1),
1.58, tolerance = 0.01)
})

test_that("medianMarchenkoPastur works correctly", {
# Expected outputs are derived from Table IV in Gavish and Donoho (2014)
expect_equal(optimalSVHTCoef(0.5) / sqrt(medianMarchenkoPastur(100, 200)),
2.1711, tolerance = 0.0001)
expect_equal(optimalSVHTCoef(0.1) / sqrt(medianMarchenkoPastur(100, 1000)),
1.6089, tolerance = 0.0001)
})

test_that("Encoding dimensions are properly calculated for simulated z-scores", {
# Simulate zScore matrix consisting of signal and noise
set.seed(42)
numGenes <- 10000
numSamples <- 200
latentDim <- 50
signalNoiseRatio <- 5
zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000

expect_equal(estimateBestQ(zScores = zTilde),
latentDim)

# Simulate zScore matrix with beta > 1
set.seed(42)
numGenes <- 50
numSamples <- 200
latentDim <- 20
signalNoiseRatio <- 5
zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000

expect_equal(estimateBestQ(zScores = zTilde),
latentDim)

# Simulate zScore matrix consisting of noise only
set.seed(42)
latentDim <- 0
zTilde <- matrix(rnorm(numGenes * numSamples), nrow = numGenes, ncol = numSamples)
expect_warning(expect_equal(estimateBestQ(zScores = zTilde), 2),
paste("Optimal latent space dimension is smaller than 2\\. Check your count matrix and",
"verify that all samples have the expected number of counts",
"\\(hist\\(colSums\\(counts\\(ods\\)\\)\\)\\)\\.",
"For now\\, the latent space dimension is set to 2\\.", collapse = "\n"))
})

test_that("Encoding dimensions are properly calculated for real ODS", {
ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
package='OUTRIDER')
ctsTable <- read.table(ctsFile, check.names=FALSE)
ods <- OutriderDataSet(countData=ctsTable)
ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
ods <- estimateSizeFactors(ods)

outsingleResult <- 5 # Expected value was calculated with OutSingle
expect_equal(estimateBestQ(ods = ods), outsingleResult,
tolerance = 1)
})
Loading
Loading