Merge branch 'galaxyproject:main' into add-ncbi-egapx

galaxyproject · Oct 15, 2024 · 3779ca4 · 3779ca4
2 parents 7f45fc8 + fb87911
commit 3779ca4
Show file tree

Hide file tree

Showing 24 changed files with 2,028 additions and 55 deletions.
diff --git a/tools/decontam/.shed.yml b/tools/decontam/.shed.yml
@@ -0,0 +1,13 @@
+name: decontam
+owner: iuc
+description: Removes decontamination features (ASVs/OTUs) using control samples
+long_description: |
+  The decontam package provides simple statistical methods to identify 
+  and visualize contaminating DNA features, allowing them to be removed 
+  and a more accurate picture of sampled communities to be constructed
+  from marker-gene and metagenomics data.
+categories:
+- Metagenomics
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam
+homepage_url: https://github.com/benjjneb/decontam
+type: unrestricted
diff --git a/tools/decontam/decontam.xml b/tools/decontam/decontam.xml
@@ -0,0 +1,173 @@
+<tool id="decontam" name="Decontam" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="bio_tools"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        Rscript '$rscript'
+    ]]></command>
+    <configfiles>
+        <configfile name="rscript"><![CDATA[
+            library(tidyverse)
+            library(phyloseq)
+            library(ggplot2)
+            library(decontam)
+
+            #if $input_type.select_input == 'phyloseq':
+                ps <- readRDS("$input_type.phyloseq_object")
+                sample_data(ps)\$control <- as.logical(sample_data(ps)[["$input_type.control_metadata"]])
+            #else
+                ## get OTU table (first column is the OTU/ASV ID)
+                otu <- read_tsv("$input_type.otu")
+                otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1]) #use first column as rownames
+                OTU <- otu_table(otu2,  taxa_are_rows = FALSE)
+
+                ## get metadata table must have matching OTU/ASV ID in first column
+                meta <- read_tsv("$input_type.metadata")
+                meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1]) #use first column as rownames
+
+                control_column = as.integer("$input_type.control") - 1 ##remove one index since the dataframe uses the first column as index
+
+                ## convert 0/1 to bool for the control column and store in control column
+                meta2\$control <- as.logical(meta2[[control_column]])
+                sampledata <- sample_data(meta2)
+
+                ps <- phyloseq(OTU, FALSE, sampledata)
+
+            #end if
+
+            ## plot library_size_vs_control
+            df <- as.data.frame(sample_data(ps)) # Put sample_data into a ggplot-friendly data.frame
+            df\$LibrarySize <- sample_sums(ps)
+            df <- df[order(df\$LibrarySize),]
+            df\$Index <- seq(nrow(df))
+            ggplot(data=df, aes(x=Index, y=LibrarySize, color=control)) + geom_point()
+            ggsave("$library_size_vs_control", device = "png", width = 10, height = 8, units = "cm")
+
+            ## plot prevalence 
+            contamdf.prev <- isContaminant(ps, method="prevalence", neg="control", threshold=$threshold)
+            table(contamdf.prev\$contaminant)
+
+            ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
+            ps.pa.neg <- prune_samples(sample_data(ps.pa)\$control == TRUE, ps.pa)
+            ps.pa.pos <- prune_samples(sample_data(ps.pa)\$control == FALSE, ps.pa)
+
+            ## Make data.frame of prevalence in positive and negative samples
+            df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
+                                contaminant=contamdf.prev\$contaminant)
+            ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
+            xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")
+            ggsave("$prevalence", device = "png", width = 10, height = 8, units = "cm")
+
+            ## remove contamination features from original data
+            #if $input_type.select_input == 'phyloseq':
+                id_name <- "SampleID"
+            #else
+                id_name <- colnames(otu)[1] ## we use the same name for the ID column as the OTU input
+            #end if
+
+            ps.noncontam <- prune_taxa(!contamdf.prev\$contaminant, ps)
+
+            otu_table(ps.noncontam) %>%
+            as.data.frame() %>%
+            rownames_to_column(id_name) -> otu
+
+            write.table(otu,
+                file="$decontam_otu", 
+                sep = "\t",
+                row.names=FALSE,
+                quote = FALSE)
+
+            saveRDS(ps.noncontam, "$decontam_phyloseq")
+
+        ]]></configfile>
+    </configfiles>
+    <inputs>
+        <conditional name="input_type">
+            <param name="select_input" type="select" label="Phyloseq or Feature table input" help="This tool can work with phyloseq objects or feature table inputs.">
+                <option value="phyloseq">Phyloseq</option>
+                <option value="feature_table">Feature table</option>
+            </param>
+            <when value="phyloseq">
+                <param name="phyloseq_object" type="data" format="phyloseq" label="Phyloseq object"/>
+                <param name="control_metadata" type="text" label="Control column" help="Column in the phyloseq metadata specifying weather a sample is a negative control (0 for normal samples / 1 for control)"/>
+            </when>
+            <when value="feature_table">
+                <param name="otu" type="data" format="tabular" label="Feature table" help="OTU/ASV or other feature table. The first column must have corresponding IDs to the metadata table."/>
+                <param name="metadata" type="data" format="tabular" label="Metadata" help="Metadata that contains a column specifying weather a samples is a negativ control (0 for normal samples / 1 for control). The first column must have corresponding IDs to the feature table."/>
+                <param name="control" type="data_column" data_ref="metadata" use_header_names="true" multiple="false" optional="false" label="Control column" help="Column specifying weather a sample is a negative control (0 for normal samples / 1 for control)."/>
+            </when>
+        </conditional>
+        <param name="threshold" type="float" label="Threshold to detect a contaminant" value="0.1" min="0" max="1" help="Probability of the feature to be a decontaminant in the statistical test being performed." />    
+    </inputs>
+    <outputs>
+        <data name="library_size_vs_control" format="png" label="${tool.name} on ${on_string}: Library Size vs Control Plot"/>
+        <data name="prevalence" format="png" label="${tool.name} on ${on_string}: Prevalence Plot"/>
+        <data name="decontam_otu" format="tabular" label="${tool.name} on ${on_string}: Removed Contaminants - Feature Table"/>
+        <data name="decontam_phyloseq" format="phyloseq" label="${tool.name} on ${on_string}: Removed Contaminants - Phyloseq Object"/>
+    </outputs>
+    <tests>
+        <test>
+            <conditional name="input_type">
+                <param name="select_input" value="phyloseq"/>
+                <param name="phyloseq_object" value="phyloseq_input.rds"/>
+                <param name="control_metadata" value="control"/>
+            </conditional>
+            <param name="threshold" value="0.5"/>
+            <output name="decontam_phyloseq" file="phyloseq_output.rds" ftype="phyloseq"/>
+            <output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/>
+            <output name="prevalence" file="Prevalence_Plot.png" ftype="png"/>
+            <output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/>
+        </test>
+        <test>
+            <conditional name="input_type">
+                    <param name="select_input" value="feature_table"/>
+                    <param name="otu" value="otu_input.tsv"/>
+                    <param name="metadata" value="metadata_input.tsv"/>
+                    <!-- using the index of the column -->
+                    <param name="control" value="8"/> 
+            </conditional>
+            <param name="threshold" value="0.5"/>
+            <output name="decontam_phyloseq" file="phyloseq_output2.rds" ftype="phyloseq"/> 
+            <output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/>
+            <output name="prevalence" file="Prevalence_Plot.png" ftype="png"/>
+            <output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Simple Statistical Identification of Contaminating Sequence Features in Marker-Gene or Metagenomics Data
+========================================================================================================
+
+This tool identifies contaminating sequence features in marker-gene or
+metagenomics datasets. It can be applied to any type of feature derived from
+environmental sequencing data (e.g., ASVs, OTUs, taxonomic groups, MAGs,
+etc.). The method requires either DNA quantitation data or sequenced negative
+control samples.
+
+.. note::
+
+   Currently, only the negative control sample method is implemented in this
+   wrapper.
+
+**Output**
+
+- If a phyloseq object is provided as input, the output will be a phyloseq
+  object pruned to include only non-contaminant features.
+- If only the feature table or metadata is provided, the output will be a
+  pruned phyloseq object containing only non-contaminant features, without
+  the TAX table. The output feature table will also be pruned to include only
+  non-contaminant features.
+
+**Threshold**
+
+The default threshold for identifying a contaminant is a probability of 0.1
+in the statistical test being performed.
+
+In the prevalence test, a special threshold value of 0.5 is notable: this
+will identify as contaminants any sequences that are more prevalent in
+negative controls than in positive samples.
+    ]]>
+    </help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/decontam/macros.xml b/tools/decontam/macros.xml
@@ -0,0 +1,23 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.22</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">22.01</token>
+    <xml name="bio_tools">
+        <xrefs>
+            <xref type="bio.tools">decontam</xref>
+            <xref type="bioconductor">decontam</xref>
+        </xrefs>
+    </xml>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">bioconductor-decontam</requirement>
+            <requirement type="package" version="1.46.0" >bioconductor-phyloseq</requirement>
+            <requirement type="package" version="2.0.0">r-tidyverse</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1186/s40168-018-0605-2</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/decontam/test-data/Library_Size_vs_Control_Plot.png b/tools/decontam/test-data/Library_Size_vs_Control_Plot.png
diff --git a/tools/decontam/test-data/Prevalence_Plot.png b/tools/decontam/test-data/Prevalence_Plot.png
diff --git a/tools/decontam/test-data/decontam_Rscript.Rmd b/tools/decontam/test-data/decontam_Rscript.Rmd
@@ -0,0 +1,176 @@
+---
+title: "decontam_docs"
+output: html_document
+date: "2024-09-10"
+---
+
+# This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Install test env and run studio in this env
+
+```{bash install}
+mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse
+mamba activate decontam
+rstudio
+```
+
+### Check correct R home
+
+```{r home}
+R.home()
+```
+
+## Get test data
+
+Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input.
+
+```{r store test data}
+R.home()
+library(phyloseq)
+packageVersion("phyloseq")
+library(ggplot2)
+packageVersion("ggplot2")
+library(decontam)
+packageVersion("decontam")
+
+ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam"))
+
+# Sample from a physeq object with a sampling function.
+#   ps: physeq object to be sampled
+#   fun: function to use for sampling (default `sample`)
+#   ...: parameters to be passed to fun,
+# see `help(sample)` for default parameters
+sample_ps <- function(ps, fun = sample, ...) {
+    ids <- sample_names(ps)
+    sampled_ids <- fun(ids, ...)
+    ps <- prune_samples(sampled_ids, ps)
+    return(ps)
+}
+
+# the initial object is to big for the test case so we subsample
+ps <- sample_ps(ps, size = 200)
+
+## ps
+# get otu table
+otu <- as(otu_table(ps), "matrix")
+head(otu[, 1:10], 10)
+
+# add control column to sample data
+sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample"
+# store as 0 and 1
+sample_data(ps)$control <- as.integer(sample_data(ps)$control)
+head(sample_data(ps), 1000)
+
+metadata <- as(sample_data(ps), "matrix")
+head(metadata, 1000)
+
+# store test data
+# stores the row names as column,
+# see https://stackoverflow.com/questions/2478352
+# /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames
+write.table(data.frame("SampleID" = rownames(otu), otu),
+    file = file.path(getwd(), "otu_input.tsv"),
+    sep = "\t",
+    row.names = FALSE,
+    quote = FALSE
+)
+write.table(data.frame("SampleID" = rownames(metadata), metadata),
+    file = file.path(getwd(), "metadata_input.tsv"),
+    sep = "\t",
+    row.names = FALSE,
+    quote = FALSE
+)
+
+saveRDS(ps, file.path(getwd(), "phyloseq_input.rds"))
+```
+
+## Load test data
+
+```{r load test data}
+library(tidyverse)
+
+# get OTU table (first column is the OTU/ASV ID)
+otu <- read_tsv(file.path(getwd(), "otu_input.tsv"))
+# use first column as colname
+otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1])
+otu <- otu_table(otu2, taxa_are_rows = FALSE)
+
+# get metadata table must have matching OTU/ASV ID in first column
+meta <- read_tsv(file.path(getwd(), "metadata_input.tsv"))
+# use first column as colname
+
+meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1])
+control_column <- "control"
+
+# convert 0/1 to bool for the control column and store in control column
+meta2$control <- as.logical(meta2[[control_column]])
+sampledata <- sample_data(meta2)
+
+# create dummy tax table (actually not needed,
+# but nice to learn how to load phyloseq objects)
+taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE),
+    nrow = ncol(otu2), ncol = 7
+))
+rownames(taxmat) <- colnames(otu2)
+tax <- tax_table(as.matrix(taxmat))
+
+ps <- phyloseq(otu, tax, sampledata)
+```
+
+# plot 1
+
+```{r plot library size vs control}
+# Put sample_data into a ggplot-friendly data.frame
+df <- as.data.frame(sample_data(ps))
+df$LibrarySize <- sample_sums(ps)
+df <- df[order(df$LibrarySize), ]
+df$Index <- seq_len(nrow(df))
+ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) +
+    geom_point()
+```
+
+# plot 2
+
+```{r plot prevalence}
+contamdf_prev <- isContaminant(ps,
+    method = "prevalence",
+    neg = "control",
+    threshold = 0.5
+)
+table(contamdf_prev$contaminant)
+
+ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0))
+ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa)
+ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa)
+# Make data_frame of prevalence in positive and negative samples
+df_pa <- data.frame(
+    pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg),
+    contaminant = contamdf_prev$contaminant
+)
+ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) +
+    geom_point() +
+    xlab("Prevalence (Negative Controls)") +
+    ylab("Prevalence (True Samples)")
+```
+
+# generate output
+
+```{r remove contams}
+id_name <- colnames(otu)[1]
+
+ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps)
+
+otu_table(ps_noncontam) %>%
+    as.data.frame() %>%
+    rownames_to_column(id_name) <- otu
+
+write.table(otu,
+    file = file.path(getwd(), "otu_output.tsv"),
+    sep = "\t",
+    row.names = FALSE,
+)
+```