diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml
index aa0e6b2..08622fd 100644
--- a/.github/workflows/download_pipeline.yml
+++ b/.github/workflows/download_pipeline.yml
@@ -11,14 +11,14 @@ on:
description: "The specific branch you wish to utilize for the test execution of nf-core download."
required: true
default: "dev"
-# pull_request:
-# types:
-# - opened
-# branches:
-# - master
-# pull_request_target:
-# branches:
-# - master
+ pull_request:
+ types:
+ - opened
+ branches:
+ - master
+ pull_request_target:
+ branches:
+ - master
env:
NXF_ANSI_LOG: false
diff --git a/docs/images/luksza2021_fig3.png b/docs/images/luksza2021_fig3.png
new file mode 100644
index 0000000..0c87548
Binary files /dev/null and b/docs/images/luksza2021_fig3.png differ
diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png
deleted file mode 100755
index 361d0e4..0000000
Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ
diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png
deleted file mode 100755
index cb39ebb..0000000
Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ
diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png
deleted file mode 100755
index a4b89bf..0000000
Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ
diff --git a/docs/output.md b/docs/output.md
index 7580a0e..012467e 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -12,48 +12,95 @@ The directories listed below will be created in the results directory after the
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+1. Create phylogenetic trees using [PhyloWGS](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8)
+2. Use [netMHCpan-4.1](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) to calculate binding affinities
+3. Use [netMHCstabpan-1.0](https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/) to calculate stability scores
+4. Use Luksza et al.'s neoantigen quality and fitness computation tool ([NeoantigenEditing](https://github.com/LukszaLab/NeoantigenEditing)) to evaluate peptides
-### FastQC
+### PhyloWGS
Output files
-- `fastqc/`
- - `*_fastqc.html`: FastQC report containing quality metrics.
- - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+- `phylowgs/`
+  - `*.summ.json.gz`: JSON-formatted tree summaries
+  - `*.muts.json.gz`: JSON-formatted list of mutations
+  - `*.mutass.zip`: Zipped folder of JSON-formatted assignments of SSMs and CNVs to clones
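+
+A minimal sketch for inspecting these summaries (the file name is hypothetical; the `trees`/`llh`/`populations` keys follow the PhyloWGS conventions that this pipeline's own scripts rely on and should be checked against your output):
+
+```python
+import gzip
+import json
+
+# Load the gzipped tree summaries written by PhyloWGS.
+with gzip.open("patient.summ.json.gz", "rt") as fh:
+    summ = json.load(fh)
+
+# Pick the tree with the highest log-likelihood.
+best_id, best_tree = max(summ["trees"].items(), key=lambda kv: kv[1]["llh"])
+print("best tree:", best_id, "llh:", best_tree["llh"])
+
+# Each population in the tree carries a per-sample cellular prevalence.
+for pop_id, pop in best_tree["populations"].items():
+    print(pop_id, pop["cellular_prevalence"])
+```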
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+### netMHCpan
-![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
-
-![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+
+Output files
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+- `netmhcpan/`
+  - `*.xls`: TSV/XLS output of netMHCpan, containing either the MUT or WT antigens
+  - `*.WT.netmhcpan.output`, `*.MUT.netmhcpan.output`: captured STDOUT of netMHCpan, a uniquely formatted listing of either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file.
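+
+A minimal sketch for loading the `.xls` table (it is tab-separated despite the suffix; the two-row header and column names such as `EL_Rank` are assumptions based on netMHCpan 4.1's `-xlsfile` layout and should be checked against your file — the netMHCstabpan table below can be read the same way):
+
+```python
+import pandas as pd
+
+# netMHCpan's -xlsfile output puts the HLA alleles on the first row,
+# so the real column names sit on the second row (header=1).
+df = pd.read_csv("sample.MUT.xls", sep="\t", header=1)
+
+# Keep candidate binders, e.g. eluted-ligand rank below 2%.
+binders = df[df["EL_Rank"] < 2.0]
+print(binders[["Peptide", "ID", "EL_Rank"]].head())
+```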
-:::note
-The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
-:::
+
-### MultiQC
+### netMHCstabpan
Output files
-- `multiqc/`
- - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
- - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
- - `multiqc_plots/`: directory containing static images from the report in various formats.
+- `netmhcstabpan/`
+  - `*.xls`: TSV/XLS output of netMHCstabpan, containing either the MUT or WT antigens
+  - `*.WT.netmhcstabpan.output`, `*.MUT.netmhcstabpan.output`: captured STDOUT of netMHCstabpan, a uniquely formatted listing of either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+### Neoantigen Editing Final Output
+
+
+Output files
+
+- `neoantigenediting/`
+
+  - `*_annotated.json`: The final output of the pipeline. This file is an annotated version of the tree output from PhyloWGS, with an extra property titled 'neoantigens'. Each entry in 'neoantigens' describes a single neoantigen through the properties shown below.
+
+ "id": "XSYI_MG_M_9_C1203_11",
+
+ "mutation_id": "X_72667534_C_G",
+
+ "HLA_gene_id": "HLA-C\*12:03",
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .
+ "sequence": "ASRSRHSPY",
+
+ "WT_sequence": "PSRSRHSPY",
+
+ "mutated_position": 1,
+
+ "Kd": 192.03,
+
+ "KdWT": 4582.17,
+
+ "R": 0.8911371281207195,
+
+ "logC": 2.263955023939215,
+
+ "logA": 3.1722763542054815,
+
+ "quality": 2.645601185190205
+
+ The above is an example output from a run. Each neoantigenic mutation will have an output like this.
+
+  - id: Unique ID combining the mutation, the HLA allele, and the peptide window
+  - mutation_id: ID containing the chromosome, position, ref and alt alleles. I and D denote insertions and deletions, respectively.
+  - HLA_gene_id: The HLA gene this neoantigen binds to
+  - sequence: The mutated peptide sequence
+  - WT_sequence: The wild-type peptide sequence
+  - mutated_position: The position of the first difference between the mutated and wild-type sequences
+  - Kd: Binding affinity in nM from netMHCpan for the mutated peptide
+  - KdWT: Binding affinity in nM from netMHCpan for the wild-type peptide
+  - R: Similarity of the mutated peptide to IEDB peptides
+  - logC: The log of the cross-reactivity
+  - logA: The log of the amplitude, a function of Kd/KdWT and a constant
+  - quality: The final output of the pipeline and of neoantigen editing. A higher quality indicates a better neoantigen. This is described in the Luksza et al. paper and visualized below.
+
+![Neoantigen quality model (Luksza et al.)](images/luksza2021_fig3.png)
+
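+For downstream analysis, a minimal sketch that ranks the records by quality (the file name is hypothetical; the keys are exactly the documented properties):
+
+```python
+import json
+
+# Load the annotated tree output produced by neoantigenediting.
+with open("patient_annotated.json") as fh:
+    data = json.load(fh)
+
+# Higher quality means a better neoantigen, so sort descending.
+ranked = sorted(data["neoantigens"], key=lambda neo: neo["quality"], reverse=True)
+for neo in ranked[:5]:
+    print(neo["id"], neo["HLA_gene_id"],
+          "quality=%.3f" % neo["quality"],
+          "Kd=%.1f nM (WT %.1f nM)" % (neo["Kd"], neo["KdWT"]))
+```
+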
### Pipeline information
diff --git a/modules.json b/modules.json
index 1a9db25..bf1a62f 100644
--- a/modules.json
+++ b/modules.json
@@ -6,27 +6,27 @@
"modules": {
"msk": {
"neoantigenediting/aligntoiedb": {
- "branch": "neoantigen",
- "git_sha": "14ae2b3db25701828835e7145cee14dfdaddf180",
+ "branch": "develop",
+ "git_sha": "cac9c047e374ee259fb612ba5816e7e6aae6b86f",
"installed_by": ["neoantigen_editing"]
},
"neoantigenediting/computefitness": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "1f65c2ecdc5010549055ff7f4e6b8bccee48d4ae",
"installed_by": ["neoantigen_editing"]
},
"neoantigenutils/formatnetmhcpan": {
- "branch": "neoantigen",
- "git_sha": "fceccd3f96fec678849bb9bc0c04e53d9965f973",
+ "branch": "develop",
+ "git_sha": "c5d1252252e15555abcc82ea537cebeb281a1856",
"installed_by": ["netmhcstabandpan"]
},
"neoantigenutils/generatehlastring": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970",
"installed_by": ["netmhcstabandpan"]
},
"neoantigenutils/generatemutfasta": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "bb7975c796ab9a2d7a45ef733a6a226a0f5ad74a",
"installed_by": ["netmhcstabandpan"]
},
@@ -36,32 +36,32 @@
"installed_by": ["modules"]
},
"netmhcpan": {
- "branch": "neoantigen",
- "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970",
+ "branch": "develop",
+ "git_sha": "503abeb67260f060d8228221b07d743aa4180345",
"installed_by": ["modules", "netmhcstabandpan"]
},
"netmhcstabpan": {
- "branch": "neoantigen",
- "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970",
+ "branch": "develop",
+ "git_sha": "c1a473f8bc08f778269a36ab62d5adf24357225f",
"installed_by": ["modules", "netmhcstabandpan"]
},
"phylowgs/createinput": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "b031249dcf4279606c25e626da2a628756e75e8a",
"installed_by": ["phylowgs"]
},
"phylowgs/multievolve": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "535662d391a3533dea3b11c462c14799227e08b2",
"installed_by": ["phylowgs"]
},
"phylowgs/parsecnvs": {
- "branch": "neoantigen",
- "git_sha": "064ce5b42a8f711fb4d8107150aad2d382ae99c2",
+ "branch": "develop",
+ "git_sha": "8471691d7c29bc2f5f4fb92279c94fb2640b6c38",
"installed_by": ["phylowgs"]
},
"phylowgs/writeresults": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "6d27f08bf649e8680ace321d3127dcdf0e210973",
"installed_by": ["phylowgs"]
}
@@ -70,17 +70,17 @@
"subworkflows": {
"msk": {
"neoantigen_editing": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "56a628201401866096d6307b9e8c690c5eb46ac2",
"installed_by": ["subworkflows"]
},
"netmhcstabandpan": {
- "branch": "neoantigen",
- "git_sha": "1b7ac020798572be26402a72dd9c1a22ce849a63",
+ "branch": "develop",
+ "git_sha": "d60211568e3709e9284bc06eef938e361d474d08",
"installed_by": ["subworkflows"]
},
"phylowgs": {
- "branch": "neoantigen",
+ "branch": "develop",
"git_sha": "a5d61394af346f21ee2eb7ecfd97ab25bdbd1d0e",
"installed_by": ["subworkflows"]
}
@@ -90,14 +90,9 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
- "fastqc": {
- "branch": "master",
- "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd",
- "installed_by": ["modules"]
- },
"multiqc": {
"branch": "master",
- "git_sha": "314d742bdb357a1df5f9b88427b3b6ac78aa33f7",
+ "git_sha": "b80f5fd12ff7c43938f424dd76392a2704fa2396",
"installed_by": ["modules"]
}
}
diff --git a/modules/msk/neoantigenediting/aligntoiedb/meta.yml b/modules/msk/neoantigenediting/aligntoiedb/meta.yml
index 7a097b3..77d6121 100644
--- a/modules/msk/neoantigenediting/aligntoiedb/meta.yml
+++ b/modules/msk/neoantigenediting/aligntoiedb/meta.yml
@@ -1,7 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "neoantigenediting_aligntoiedb"
-description: Align neoantigens to the IEDB resource
+description: Align neoantigens to the IEDB file
keywords:
- neoantigenediting
- neoantigens
diff --git a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py
index 20886ca..9ff645a 100755
--- a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py
+++ b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py
@@ -47,14 +47,20 @@ def load_blosum62_mat():
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
"""
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
- blosum62_mat_str_list = [l.split() for l in raw_blosum62_mat_str.strip().split("\n")]
+ blosum62_mat_str_list = [
+ l.split() for l in raw_blosum62_mat_str.strip().split("\n")
+ ]
blosum_aa_order = [blosum62_mat_str_list[0].index(aa) for aa in amino_acids]
blosum62_mat = np.zeros((len(amino_acids), len(amino_acids)))
for i, bl_ind in enumerate(blosum_aa_order):
- blosum62_mat[i] = np.array([int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]])[blosum_aa_order]
+ blosum62_mat[i] = np.array(
+ [int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]]
+ )[blosum_aa_order]
blosum62 = {
- (aaA, aaB): blosum62_mat[i, j] for i, aaA in enumerate(amino_acids) for j, aaB in enumerate(amino_acids)
+ (aaA, aaB): blosum62_mat[i, j]
+ for i, aaA in enumerate(amino_acids)
+ for j, aaB in enumerate(amino_acids)
}
return blosum62
@@ -226,7 +232,9 @@ def load_epitopes(iedbfasta):
pjson = json.load(f)
patient = pjson["patient"]
neoantigens = pjson["neoantigens"]
- peptides = set([("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens])
+ peptides = set(
+ [("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens]
+ )
pepseq2pepid = defaultdict(set)
for pep_id, pep_seq in peptides:
pepseq2pepid[pep_seq].add(pep_id)
@@ -251,5 +259,7 @@ def load_epitopes(iedbfasta):
"Alignment_score",
]
else:
- aln_data = pd.DataFrame(columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"])
+ aln_data = pd.DataFrame(
+ columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"]
+ )
aln_data.to_csv("iedb_alignments_" + patient + ".txt", sep="\t", index=False)
diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
index 03d78c4..f278817 100755
--- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
+++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
@@ -11,7 +11,6 @@
import os
-# %
class EpitopeDistance(object):
"""Base class for epitope crossreactivity.
@@ -39,7 +38,9 @@ class EpitopeDistance(object):
def __init__(
self,
- model_file=os.path.join(os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json"),
+ model_file=os.path.join(
+ os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json"
+ ),
amino_acids="ACDEFGHIKLMNPQRSTVWY",
):
"""Initialize class and compute M_ab."""
@@ -75,5 +76,11 @@ def epitope_dist(self, epiA, epiB):
"""
return sum(
- [self.d_i[i] * self.M_ab[self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]]] for i in range(9)]
+ [
+ self.d_i[i]
+ * self.M_ab[
+ self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]]
+ ]
+ for i in range(9)
+ ]
)
diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
index bc8cd54..da19f4d 100755
--- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
+++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
@@ -60,7 +60,9 @@ def fill_up_clone_neoantigens(tree, mut2neo):
while len(nodes) > 0:
node = nodes[0]
nodes = nodes[1:]
- node["neoantigens"] = [neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid]]
+ node["neoantigens"] = [
+ neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid]
+ ]
node["neoantigen_load"] = len(node["neoantigens"])
node["NA_Mut"] = sum([len(mut2neo[mid]) > 0 for mid in node["all_mutations"]])
if "children" in node:
@@ -157,7 +159,9 @@ def compute_effective_sample_size(sample_json):
for clone_muts, X in zip(clone_muts_list, freqs):
for mid in clone_muts:
mut_freqs[mid].append(X)
- avev = np.mean([np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs])
+ avev = np.mean(
+ [np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs]
+ )
n = 1 / avev
return n
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf
index 793432e..d6d67d1 100644
--- a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf
@@ -36,10 +36,10 @@ process NEOANTIGENUTILS_FORMATNETMHCPAN {
stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
+    def netmhcOutputType = meta.typeMut ? "MUT" : "WT"
+    def netmhcOutputFrom = meta.fromStab ? "STAB" : "PAN"
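+    // Stub filename mirrors the real task's output: <prefix>.<MUT|WT>.<PAN|STAB>.tsv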
"""
-
- touch ${prefix}.MUT.tsv
- touch ${prefix}.WT.tsv
+ touch ${prefix}.${netmhcOutputType}.${netmhcOutputFrom}.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
formatNetmhcpanOutput: \$(echo \$(format_netmhcpan_output.py -v))
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
index 3b81dcb..a11be09 100755
--- a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
@@ -71,7 +71,9 @@ def netMHCpan_out_reformat(netMHCpanoutput, mut, stab, prefix):
def parse_args():
parser = argparse.ArgumentParser(description="Process input files and parameters")
- parser.add_argument("--netMHCpan_output", required=True, help="Path to netMHC output")
+ parser.add_argument(
+ "--netMHCpan_output", required=True, help="Path to netMHC output"
+ )
parser.add_argument("--type_MUT", action="store_true", help="Output is a MUT type")
parser.add_argument(
"--from_STAB",
@@ -79,13 +81,17 @@ def parse_args():
help="Output is from netmhcstab",
)
parser.add_argument("--id", required=True, help="Prefix to label the output")
- parser.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION))
+ parser.add_argument(
+ "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
+ )
return parser.parse_args()
def main(args):
- netMHCpan_out_reformat(args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id)
+ netMHCpan_out_reformat(
+ args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id
+ )
if __name__ == "__main__":
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap
index 28d29b9..685825a 100644
--- a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap
@@ -30,11 +30,7 @@
]
}
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-13T12:57:40.919561"
+ "timestamp": "2024-07-30T13:46:27.878268"
},
"neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv - stub": {
"content": [
@@ -46,10 +42,7 @@
"typeMut": true,
"fromStab": false
},
- [
- "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
- "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
- ]
+ "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"1": [
@@ -62,10 +55,7 @@
"typeMut": true,
"fromStab": false
},
- [
- "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
- "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
- ]
+ "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
@@ -73,11 +63,7 @@
]
}
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-13T12:58:01.264912"
+ "timestamp": "2024-07-30T13:47:05.72509"
},
"neoantigenutils_formatnetmhcpan - output(WT,netmhcpan) - tsv": {
"content": [
@@ -110,11 +96,7 @@
]
}
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-13T12:57:46.381516"
+ "timestamp": "2024-07-30T13:46:37.183992"
},
"neoantigenutils_formatnetmhcpan - output(MUT,netmhcpanstab) - tsv": {
"content": [
@@ -147,11 +129,7 @@
]
}
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-13T12:57:51.691253"
+ "timestamp": "2024-07-30T13:46:47.110076"
},
"neoantigenutils_formatnetmhcpan - output(WT,netmhcpanstab) - tsv": {
"content": [
@@ -184,10 +162,6 @@
]
}
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-13T12:57:56.882818"
+ "timestamp": "2024-07-30T13:46:56.841519"
}
}
\ No newline at end of file
diff --git a/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py b/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py
index e09ab78..84d23fc 100755
--- a/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py
+++ b/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py
@@ -40,10 +40,18 @@ def main():
help="sample_id used to limit neoantigen prediction to identify mutations "
"associated with the patient in the MAF (column 16). ",
)
- required_arguments.add_argument("--output_dir", required=True, help="output directory")
- required_arguments.add_argument("--maf_file", required=True, help="expects a CMO maf file (post vcf2maf.pl)")
- required_arguments.add_argument("--CDS_file", required=True, help="expects a fa.gz file")
- required_arguments.add_argument("--CDNA_file", required=True, help="expects a fa.gz file")
+ required_arguments.add_argument(
+ "--output_dir", required=True, help="output directory"
+ )
+ required_arguments.add_argument(
+ "--maf_file", required=True, help="expects a CMO maf file (post vcf2maf.pl)"
+ )
+ required_arguments.add_argument(
+ "--CDS_file", required=True, help="expects a fa.gz file"
+ )
+ required_arguments.add_argument(
+ "--CDNA_file", required=True, help="expects a fa.gz file"
+ )
optional_arguments = parser.add_argument_group("Optional arguments")
optional_arguments.add_argument(
@@ -51,7 +59,9 @@ def main():
required=False,
help="comma-separated numbers indicating the lengths of peptides to generate. Default: 9,10",
)
- optional_arguments.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION))
+ optional_arguments.add_argument(
+ "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
+ )
args = parser.parse_args()
@@ -74,7 +84,9 @@ def main():
#
logger = logging.getLogger("neoantigen")
logger.setLevel(logging.DEBUG)
- console_formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S")
+ console_formatter = logging.Formatter(
+ "%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S"
+ )
# logfile handler
handler_file = logging.FileHandler(output_dir + "/neoantigen_run.log", mode="w")
@@ -103,7 +115,9 @@ def main():
logger.info("Finished loading reference CDS/cDNA sequences...")
logger.info("Reading MAF file and constructing mutated peptides...")
- maf_df = skip_lines_start_with(maf_file, "#", low_memory=False, header=0, sep="\t")
+ maf_df = skip_lines_start_with(
+ maf_file, "#", low_memory=False, header=0, sep="\t"
+ )
n_muts = n_non_syn_muts = n_missing_tx_id = 0
for index, row in maf_df.iterrows():
cds_seq = ""
@@ -148,7 +162,13 @@ def main():
logger.info("\tMAF mutations summary")
logger.info("\t\t# mutations: " + str(n_muts))
- logger.info("\t\t# non-syn: " + str(n_non_syn_muts) + " (# with missing CDS: " + str(n_missing_tx_id) + ")")
+ logger.info(
+ "\t\t# non-syn: "
+ + str(n_non_syn_muts)
+ + " (# with missing CDS: "
+ + str(n_missing_tx_id)
+ + ")"
+ )
except Exception:
logger.error("Error while generating mutated peptides")
@@ -263,12 +283,18 @@ def get_weak_binders(self):
return [x for x in self.get_best_per_icore() if x.is_weak_binder()]
def get_all_binders(self):
- return [x for x in self.get_best_per_icore() if x.is_strong_binder() or x.is_weak_binder()]
+ return [
+ x
+ for x in self.get_best_per_icore()
+ if x.is_strong_binder() or x.is_weak_binder()
+ ]
def get_best_binder(self):
if len(self.get_best_per_icore()) == 0:
return None
- return sorted(self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False)[0]
+ return sorted(
+ self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False
+ )[0]
#
@@ -401,7 +427,13 @@ def __init__(self, maf_row, cds_seq, cdna_seq):
)
else:
- self.identifier_key = str(self.maf_row["Chromosome"]) + encoded_position + "_" + "SY" + Allele2code
+ self.identifier_key = (
+ str(self.maf_row["Chromosome"])
+ + encoded_position
+ + "_"
+ + "SY"
+ + Allele2code
+ )
print(self.identifier_key)
### Check if the variant_classification is among those that can generate a neoantigen
@@ -415,7 +447,9 @@ def is_non_syn(self):
"Nonstop_Mutation",
]
- return self.maf_row["Variant_Classification"] in types and not pd.isnull(self.maf_row["HGVSp_Short"])
+ return self.maf_row["Variant_Classification"] in types and not pd.isnull(
+ self.maf_row["HGVSp_Short"]
+ )
### helper function #source: stackoverflow.
def reverse_complement(self, dna):
@@ -522,13 +556,19 @@ def generate_translated_sequences(self, pad_len=10):
position, ref_allele, alt_allele, sequence, hgvsc_type = [-1, "", "", "", "ONP"]
if re.match(r"^c\.(\d+).*([ATCG]+)>([ATCG]+)$", hgvsc):
- position, ref_allele, alt_allele = re.match(r"^c\.(\d+).*(\w+)>(\w+)", hgvsc).groups()
+ position, ref_allele, alt_allele = re.match(
+ r"^c\.(\d+).*(\w+)>(\w+)", hgvsc
+ ).groups()
elif re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc):
- position, ref_allele, alt_allele = re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc).groups()
+ position, ref_allele, alt_allele = re.match(
+ r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc
+ ).groups()
elif re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc):
- position, hgvsc_type, sequence = re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc).groups()
+ position, hgvsc_type, sequence = re.match(
+ r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc
+ ).groups()
else:
sys.exit("Error: not one of the known HGVSc strings: " + hgvsc)
@@ -572,7 +612,10 @@ def generate_translated_sequences(self, pad_len=10):
mt_rev = mt[::-1]
for i in range(0, min(len(wt), len(mt))):
len_from_end = i
- if len_from_end + len_from_start >= min(len(wt), len(mt)) or wt_rev[i : i + 1] != mt_rev[i : i + 1]:
+ if (
+ len_from_end + len_from_start >= min(len(wt), len(mt))
+ or wt_rev[i : i + 1] != mt_rev[i : i + 1]
+ ):
break
wt_start = len_from_start
@@ -584,8 +627,12 @@ def generate_translated_sequences(self, pad_len=10):
self.wt_aa = wt
self.mt_aa = mt
- self.wt_altered_aa = wt[max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1)]
- self.mt_altered_aa = mt[max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1)]
+ self.wt_altered_aa = wt[
+ max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1)
+ ]
+ self.mt_altered_aa = mt[
+ max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1)
+ ]
# function to iterate over all the the neopeptide predictions made in the entire MAF and identify
# which neopeptides are generated by this mutation object
@@ -617,7 +664,9 @@ def get_maf_row_to_print(self):
row["neo_best_algorithm"] = best_binder.algorithm
row["neo_best_version"] = best_binder.version
row["neo_best_hla_allele"] = best_binder.hla_allele
- row["neo_n_peptides_evaluated"] = len(self.predicted_neopeptides.get_best_per_icore())
+ row["neo_n_peptides_evaluated"] = len(
+ self.predicted_neopeptides.get_best_per_icore()
+ )
row["neo_n_strong_binders"] = len(strong_binders)
row["neo_n_weak_binders"] = len(weak_binders)
else:
diff --git a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
index c980561..5aa3e09 100755
--- a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
+++ b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
@@ -9,8 +9,8 @@
VERSION = 1.7
-
def main(args):
+
def makeChild(subTree, start):
if start:
subTree = 0
@@ -37,10 +37,16 @@ def makeChild(subTree, start):
pass
else:
for ssm in treefile["mut_assignments"][str(subTree)]["ssms"]:
- ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"])
+ ssmli.append(
+ chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"]
+ )
newsubtree["clone_mutations"] = ssmli
- newsubtree["X"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
- newsubtree["x"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
+ newsubtree["X"] = trees[tree]["populations"][str(subTree)][
+ "cellular_prevalence"
+ ][0]
+ newsubtree["x"] = trees[tree]["populations"][str(subTree)][
+ "cellular_prevalence"
+ ][0]
newsubtree["new_x"] = 0.0
except Exception as e:
print("Error in adding new subtree. Error not in base case**")
@@ -59,15 +65,21 @@ def makeChild(subTree, start):
try:
ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"])
except Exception as e:
- print("Error in appending to mutation list. Error in base case appending ssm to ssmli")
+ print(
+ "Error in appending to mutation list. Error in base case appending ssm to ssmli"
+ )
print(e)
# print(str(subTree))
pass
try:
newsubtree["clone_mutations"] = ssmli
- newsubtree["X"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
- newsubtree["x"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
+ newsubtree["X"] = trees[tree]["populations"][str(subTree)][
+ "cellular_prevalence"
+ ][0]
+ newsubtree["x"] = trees[tree]["populations"][str(subTree)][
+ "cellular_prevalence"
+ ][0]
newsubtree["new_x"] = 0.0
except Exception as e:
print("Error in adding new subtree. Error in base case")
@@ -93,7 +105,7 @@ def makeChild(subTree, start):
for index, row in mafdf.iterrows():
if (
- # We
+ #We
row["Variant_Type"] == "SNP"
or row["Variant_Type"] == "DEL"
or row["Variant_Type"] == "INS"
@@ -343,13 +355,13 @@ def find_first_difference_index(str1, str2):
# This is used as last resort for the matching. We will preferentially find the peptide matching in length as well as POS. Worst case we will default to the WT pos 0
if noposID not in WTdict:
WTdict[noposID] = {
- "peptides": {row_WT["peptide"]: id}, # This is a dict so we can match the peptide with the ID later
- "affinity": row_WT["affinity"],
+ 'peptides' : {row_WT["peptide"]:id}, #This is a dict so we can match the peptide with the ID later
+ "affinity": row_WT["affinity"]
}
else:
# print(WTdict[noposID]['peptides'])
- WTdict[noposID]["peptides"][row_WT["peptide"]] = id
+ WTdict[noposID]['peptides'][row_WT["peptide"]]=id
def find_most_similar_string(target, strings):
max_score = -1
@@ -375,7 +387,7 @@ def find_most_similar_string(target, strings):
max_score2 = score
most_similar_string2 = s
- if target[0] == s[0]:
+ if target[0]==s[0]:
if score > first_AA_same_score:
first_AA_same = s
first_AA_same_score = score
@@ -383,13 +395,13 @@ def find_most_similar_string(target, strings):
return most_similar_string, most_similar_string2, first_AA_same, first_AA_same_score, max_score
for index_mut, row_mut in neoantigen_mut_in.iterrows():
- IDsplit = row_mut["Identity"].split("_")
- if row_mut["affinity"] < 500:
+ IDsplit = row_mut["Identity"].split('_')
+ if row_mut["affinity"]< 500:
peplen = len(row_mut["peptide"])
matchfound = False
- IDsplit = row_mut["Identity"].split("_")
- if IDsplit[1][0] == "S" and IDsplit[1][1] != "p":
- # If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. Splices can be Sp
+ IDsplit = row_mut["Identity"].split('_')
+ if (IDsplit[1][0] == "S" and IDsplit[1][1] != 'p') :
+ #If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. Splices can be Sp
continue
# first find match in WT
WTid = (
@@ -410,7 +422,7 @@ def find_most_similar_string(target, strings):
+ row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "")
)
- if WTid in WTdict and ("M" == IDsplit[1][0] and "Sp" not in row_mut["Identity"]):
+ if WTid in WTdict and ('M' == IDsplit[1][0] and 'Sp' not in row_mut["Identity"]):
# match
matchfound = True
best_pepmatch = WTdict[WTid]["peptide"]
@@ -424,31 +436,28 @@ def find_most_similar_string(target, strings):
# print(mutation_dict[row_mut["Identity"]])
else:
- (
- best_pepmatch,
- best_pepmatch2,
- first_AA_same,
- first_AA_same_score,
- match_score,
- ) = find_most_similar_string(row_mut["peptide"], list(WTdict[noposID]["peptides"].keys()))
+ best_pepmatch,best_pepmatch2 , first_AA_same, first_AA_same_score, match_score = find_most_similar_string(row_mut["peptide"],list(WTdict[noposID]['peptides'].keys()))
if best_pepmatch == row_mut["peptide"]:
- # it seems this can happen where the row_mut is actually the canonical sequence.
+            # It seems this can happen when the row_mut is actually the canonical sequence.
             # In this case we don't want to report the peptide as a neoantigen; it's not neo.
continue
- elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or (
- best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1]
- ):
+ elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or (best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1]):
             # We should preferentially match the first AA if we can. I have found that the pairwise alignment isn't always the best at this.
             # It will also do this when the last AA of the best match doesn't match but the last AA of the second best match does
best_pepmatch = best_pepmatch2
- WTid = WTdict[noposID]["peptides"][best_pepmatch]
- matchfound = True
+ WTid = WTdict[noposID]['peptides'][best_pepmatch]
+ matchfound=True
if matchfound == True:
- mut_pos = find_first_difference_index(row_mut["peptide"], best_pepmatch) + 1 # WTdict[WTid]["peptide"]
+ mut_pos = (
+ find_first_difference_index(
+ row_mut["peptide"], best_pepmatch #WTdict[WTid]["peptide"]
+ )
+ + 1
+ )
neo_dict = {
"id": row_mut["Identity"]
@@ -461,7 +470,7 @@ def find_most_similar_string(target, strings):
"mutation_id": mutation_dict[row_mut["Identity"]],
"HLA_gene_id": row_mut["MHC"],
"sequence": row_mut["peptide"],
- "WT_sequence": best_pepmatch, # WTdict[WTid]["peptide"],
+ "WT_sequence": best_pepmatch ,#WTdict[WTid]["peptide"],
"mutated_position": mut_pos,
"Kd": float(row_mut["affinity"]),
"KdWT": float(WTdict[WTid]["affinity"]),
@@ -514,7 +523,7 @@ def makeID(maf_row):
"Frame_shift_Del": "I-",
"In_Frame_Ins": "If",
"In_Frame_Del": "Id",
- "Splice_Site": "Sp",
+ "Splice_Site": "Sp"
}
position = int(str(maf_row["Start_Position"])[0:2])
@@ -564,15 +573,26 @@ def makeID(maf_row):
)
else:
- identifier_key = str(maf_row["Chromosome"]) + encoded_position + "_" + "SY" + Allele2code + "_M"
+ identifier_key = (
+ str(maf_row["Chromosome"])
+ + encoded_position
+ + "_"
+ + "SY"
+ + Allele2code
+ + "_M"
+ )
return identifier_key
def parse_args():
parser = argparse.ArgumentParser(description="Process input files and parameters")
parser.add_argument("--maf_file", required=True, help="Path to the MAF file")
- parser.add_argument("--summary_file", required=True, help="Path to the summary file")
- parser.add_argument("--mutation_file", required=True, help="Path to the mutation file")
+ parser.add_argument(
+ "--summary_file", required=True, help="Path to the summary file"
+ )
+ parser.add_argument(
+ "--mutation_file", required=True, help="Path to the mutation file"
+ )
parser.add_argument(
"--tree_directory",
required=True,
@@ -581,7 +601,9 @@ def parse_args():
parser.add_argument("--id", required=True, help="ID")
parser.add_argument("--patient_id", required=True, help="Patient ID")
parser.add_argument("--cohort", required=True, help="Cohort")
- parser.add_argument("--HLA_genes", required=True, help="Path to the file containing HLA genes")
+ parser.add_argument(
+ "--HLA_genes", required=True, help="Path to the file containing HLA genes"
+ )
parser.add_argument(
"--netMHCpan_MUT_input",
required=True,
@@ -597,7 +619,9 @@ def parse_args():
"--patient_data_file",
help="Path to the optional file containing status, overall survival, and PFS",
)
- parser.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION))
+ parser.add_argument(
+ "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
+ )
return parser.parse_args()
diff --git a/modules/msk/netmhcpan/main.nf b/modules/msk/netmhcpan/main.nf
index 4f03154..bc54431 100644
--- a/modules/msk/netmhcpan/main.nf
+++ b/modules/msk/netmhcpan/main.nf
@@ -25,14 +25,25 @@ process NETMHCPAN {
output_meta = meta.clone()
output_meta.typeMut = inputType == "MUT" ? true : false
output_meta.fromStab = false
+ def NETMHCPAN_VERSION = "4.1"
"""
- /usr/local/bin/netMHCpan-4.1/netMHCpan -s 0 -BA 1 -f ${inputFasta} -a ${hla} -l 9,10 -inptype 0 -xls -xlsfile ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output
+ /usr/local/bin/netMHCpan-${NETMHCPAN_VERSION}/netMHCpan \
+ -s 0 \
+ -BA 1 \
+ -f ${inputFasta} \
+ -a ${hla} \
+ -l 9,10 \
+ -inptype 0 \
+ -xls \
+ ${args} \
+ -xlsfile \
+ ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- netmhcpan: v4.1
+ netmhcpan: v${NETMHCPAN_VERSION}
END_VERSIONS
"""
@@ -40,6 +51,7 @@ process NETMHCPAN {
stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
+ def NETMHCPAN_VERSION = "4.1"
output_meta = meta.clone()
output_meta.typeMut = inputType == "MUT" ? true : false
output_meta.fromStab = false
@@ -49,7 +61,7 @@ process NETMHCPAN {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- netmhcpan: v4.1
+ netmhcpan: v${NETMHCPAN_VERSION}
END_VERSIONS
"""
}
diff --git a/modules/msk/netmhcstabpan/main.nf b/modules/msk/netmhcstabpan/main.nf
index df5b61a..316195f 100644
--- a/modules/msk/netmhcstabpan/main.nf
+++ b/modules/msk/netmhcstabpan/main.nf
@@ -25,14 +25,23 @@ process NETMHCSTABPAN {
output_meta = meta.clone()
output_meta.typeMut = inputType == "MUT" ? true : false
output_meta.fromStab = true
+
+ def NETMHCPAN_VERSION = "4.1"
+ def NETMHCSTABPAN_VERSION = "1.0"
+
"""
- /usr/local/bin/netMHCstabpan-1.0/netMHCstabpan -s -1 -f ${inputFasta} -a ${hla} -l 9,10 -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output
+ /usr/local/bin/netMHCstabpan-${NETMHCSTABPAN_VERSION}/netMHCstabpan \
+ -s -1 \
+ -f ${inputFasta} \
+ -a ${hla} \
+ -l 9,10 \
+ -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- netmhcpan: v4.1
- netmhcstabpan: v1.0
+ netmhcpan: v${NETMHCPAN_VERSION}
+ netmhcstabpan: v${NETMHCSTABPAN_VERSION}
END_VERSIONS
"""
@@ -43,14 +52,17 @@ process NETMHCSTABPAN {
output_meta = meta.clone()
output_meta.typeMut = inputType == "MUT" ? true : false
output_meta.fromStab = true
+ def NETMHCPAN_VERSION = "4.1"
+ def NETMHCSTABPAN_VERSION = "1.0"
+
"""
touch ${prefix}.MUT.netmhcstabpan.output
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- netmhcpan: v4.1
- netmhcstabpan: v1.0
+ netmhcpan: v${NETMHCPAN_VERSION}
+ netmhcstabpan: v${NETMHCSTABPAN_VERSION}
END_VERSIONS
"""
}
diff --git a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py
index b565290..e15092a 100755
--- a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py
+++ b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py
@@ -22,7 +22,9 @@ class ReadCountsUnavailableError(Exception):
class MafVariant(object):
- def __init__(self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele):
+ def __init__(
+ self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele
+ ):
self.CHROM = chromosome
self.POS = position
self.FILTER = filter
@@ -105,8 +107,12 @@ def _get_tumor_index(self, variant, tumor_sample=None):
Might not always be true
"""
if self._tumor_sample:
- tumor_is = [i for i, s in enumerate(variant.samples) if s.sample == tumor_sample]
- assert len(tumor_is) == 1, "Did not find tumor name %s in samples" % tumor_sample
+ tumor_is = [
+ i for i, s in enumerate(variant.samples) if s.sample == tumor_sample
+ ]
+ assert len(tumor_is) == 1, (
+ "Did not find tumor name %s in samples" % tumor_sample
+ )
return tumor_is[0]
else:
# Don't make this -1, as some code assumes it will be >= 0.
@@ -146,10 +152,14 @@ def _calc_read_counts(self, variant):
},
}
- ref_reads = tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt]
+ ref_reads = (
+ tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt]
+ )
# For now, variant reads are defined as only the non-reference nucleotide in
# the inferred tumor SNP. We ignore reads of a third or fourth base.
- variant_reads = tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt]
+ variant_reads = (
+ tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt]
+ )
total_reads = ref_reads + variant_reads
return (ref_reads, total_reads)
@@ -235,7 +245,9 @@ def _calc_read_counts(self, variant):
total_reads = 0
variant_reads = 0
else:
- variant_reads = int(getattr(variant.samples[tumor_i].data, str(alt) + "U")[0])
+ variant_reads = int(
+ getattr(variant.samples[tumor_i].data, str(alt) + "U")[0]
+ )
ref_reads = total_reads - variant_reads
return (ref_reads, total_reads)
@@ -299,6 +311,7 @@ def _calc_read_counts(self, variant):
class MafParser(MutectSmchetParser):
+
def _calc_read_counts(self, variant):
total_reads = variant.T_REF + variant.T_ALT
return (variant.T_REF, total_reads)
@@ -323,7 +336,9 @@ def _parse_maf(self, maf_filename):
pos = start
if variant_type == "DEL":
pos = pos - 1
- variant = MafVariant(chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele)
+ variant = MafVariant(
+ chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele
+ )
variant_list.append(variant)
return variant_list
@@ -433,7 +448,9 @@ def _format_cnvs(self, cnvs, variants):
for chrom, chrom_cnvs in cnvs.items():
for cnv in chrom_cnvs:
- overlapping_variants = self._find_overlapping_variants(chrom, cnv, variants)
+ overlapping_variants = self._find_overlapping_variants(
+ chrom, cnv, variants
+ )
total_reads = self._calc_total_reads(cnv["start"], cnv["end"])
ref_reads = self._calc_ref_reads(cnv["cell_prev"], total_reads)
yield {
@@ -477,12 +494,20 @@ def format_and_merge_cnvs(self, cnvs, variants, cellularity):
for K in ("chrom", "start", "end", "major_cn", "minor_cn"):
physical_cnvs[K] = cnv[K]
- assert len(set(physical_cnvs["major_cn"])) == len(set(physical_cnvs["major_cn"])) == 1
+ assert (
+ len(set(physical_cnvs["major_cn"]))
+ == len(set(physical_cnvs["major_cn"]))
+ == 1
+ )
physical_cnvs["major_cn"] = physical_cnvs["major_cn"][0]
physical_cnvs["minor_cn"] = physical_cnvs["minor_cn"][0]
- physical_cnvs["cell_prev"] = "|".join([str(C) for C in cnv["cellular_prevalence"]])
- cnv["physical_cnvs"] = ",".join(["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()])
+ physical_cnvs["cell_prev"] = "|".join(
+ [str(C) for C in cnv["cellular_prevalence"]]
+ )
+ cnv["physical_cnvs"] = ",".join(
+ ["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()]
+ )
merged, formatted = formatted[:1], formatted[1:]
merged[0]["cnv_id"] = "c0"
@@ -496,13 +521,20 @@ def format_and_merge_cnvs(self, cnvs, variants, cellularity):
# Only merge CNVs if they're clonal. If they're subclonal, leave them
# free to move around the tree.
- if np.array_equal(current["cellular_prevalence"], last["cellular_prevalence"]) and np.array_equal(
- last["cellular_prevalence"], cellularity
- ):
+ if np.array_equal(
+ current["cellular_prevalence"], last["cellular_prevalence"]
+ ) and np.array_equal(last["cellular_prevalence"], cellularity):
# Merge the CNVs.
- log("Merging %s_%s and %s_%s" % (current["chrom"], current["start"], last["chrom"], last["start"]))
- last["total_reads"] = self._cap_cnv_D(current["total_reads"] + last["total_reads"])
- last["ref_reads"] = self._calc_ref_reads(last["cellular_prevalence"], last["total_reads"])
+ log(
+ "Merging %s_%s and %s_%s"
+ % (current["chrom"], current["start"], last["chrom"], last["start"])
+ )
+ last["total_reads"] = self._cap_cnv_D(
+ current["total_reads"] + last["total_reads"]
+ )
+ last["ref_reads"] = self._calc_ref_reads(
+ last["cellular_prevalence"], last["total_reads"]
+ )
last["physical_cnvs"] += ";" + current["physical_cnvs"]
self._merge_variants(last, current)
else:
@@ -534,7 +566,9 @@ def _calc_ref_freq(self, ref_genotype, error_rate):
raise Exception("Nonsensical frequency: %s" % freq)
return freq
- def format_variants(self, variants, ref_read_counts, total_read_counts, error_rate, sex):
+ def format_variants(
+ self, variants, ref_read_counts, total_read_counts, error_rate, sex
+ ):
for variant_idx, variant in enumerate(variants):
ssm_id = "s%s" % self._counter
if hasattr(variant, "ID") and variant.ID is not None:
@@ -649,7 +683,10 @@ def _create_intervals(self, cnv_set):
# some intervals revealed that the code crashes on this input. We
# should provide a more informative error in such cases, which the
# following assertion does.
- assert cnv["start"] < cnv["end"], "In CNV %s, start position occurs at or after the end position" % cnv
+ assert cnv["start"] < cnv["end"], (
+ "In CNV %s, start position occurs at or after the end position"
+ % cnv
+ )
start_pos = [
(
@@ -670,7 +707,9 @@ def _create_intervals(self, cnv_set):
# True > False, so this sorting will place start positions after end
# positions if both have same coordinate.
- positions = sorted(start_pos + end_pos, key=lambda e: (e[0], e[1] == "start"))
+ positions = sorted(
+ start_pos + end_pos, key=lambda e: (e[0], e[1] == "start")
+ )
assert len(positions) >= 2, "Fewer than two positions in %s" % positions
# prev_pos is updated each time we move to a new coordinate on the
@@ -701,7 +740,9 @@ def _create_intervals(self, cnv_set):
assert locus > prev_pos
interval = (prev_pos, locus)
if interval[1] - interval[0] > min_size_for_inclusion:
- intervals[chrom].append((interval[0], interval[1], sorted(list(set(open_samples)))))
+ intervals[chrom].append(
+ (interval[0], interval[1], sorted(list(set(open_samples))))
+ )
else:
# All points should be start points.
assert set([i[1] for i in points_at_locus]) == set(["start"])
@@ -831,7 +872,9 @@ def _get_abnormal_state_for_all_samples(self, chrom, cnv):
abnormal_state = None
filtered = []
- for sampidx, cell_prev, major, minor in zip(cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"]):
+ for sampidx, cell_prev, major, minor in zip(
+ cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"]
+ ):
# Region may be (clonal or subclonal) normal in a sample, so ignore such records.
if self._is_region_normal_cn(chrom, major, minor):
continue
@@ -908,12 +951,15 @@ def load_single_abnormal_state_cnvs(self):
if not is_good_chrom(chrom):
continue
for cnv in chrom_cnvs:
- states_for_all_samples = self._get_abnormal_state_for_all_samples(chrom, cnv)
+ states_for_all_samples = self._get_abnormal_state_for_all_samples(
+ chrom, cnv
+ )
if states_for_all_samples is None:
continue
combined_states = {
- K: np.array([S[K] for S in states_for_all_samples]) for K in states_for_all_samples[0].keys()
+ K: np.array([S[K] for S in states_for_all_samples])
+ for K in states_for_all_samples[0].keys()
}
cnv.update(combined_states)
abnormal_cnvs[chrom].append(cnv)
@@ -932,7 +978,9 @@ def load_normal_cnvs(self):
if not is_good_chrom(chrom):
continue
for cnv in chrom_cnvs:
- if not self._is_multisample_region_normal_cn(chrom, cnv["major_cn"], cnv["minor_cn"]):
+ if not self._is_multisample_region_normal_cn(
+ chrom, cnv["major_cn"], cnv["minor_cn"]
+ ):
continue
if not set(cnv["sampidx"]) == self.sampidxs:
continue
@@ -1081,7 +1129,9 @@ def retain_only_variants_in_normal_cn_regions(self):
raise Exception("CN regions not yet provided")
normal_cn = self._multisamp_cnv.load_normal_cnvs()
- filtered = self._filter_variants_outside_regions(normal_cn, "all_variants", "only_normal_cn")
+ filtered = self._filter_variants_outside_regions(
+ normal_cn, "all_variants", "only_normal_cn"
+ )
def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self):
# Battenberg:
@@ -1104,9 +1154,13 @@ def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self):
# If variant isn't listed in *any* region: exclude (as we suspect CNV
# caller didn't know what to do with the region).
- self._filter_variants_outside_regions(self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions")
+ self._filter_variants_outside_regions(
+ self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions"
+ )
- def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, sex):
+ def format_variants(
+ self, sample_size, error_rate, priority_ssms, only_priority, sex
+ ):
if sample_size is None:
sample_size = len(self._variant_idxs)
random.shuffle(self._variant_idxs)
@@ -1135,7 +1189,11 @@ def format_variants(self, sample_size, error_rate, priority_ssms, only_priority,
else:
nonsubsampled.append(variant_idx)
- assert len(used_variant_idxs) == len(self._variant_idxs) == len(subsampled) + len(nonsubsampled)
+ assert (
+ len(used_variant_idxs)
+ == len(self._variant_idxs)
+ == len(subsampled) + len(nonsubsampled)
+ )
subsampled.sort(key=lambda idx: variant_key(self._variants[idx]))
subsampled_variants = get_elements_at_indices(self._variants, subsampled)
@@ -1174,7 +1232,9 @@ def write_variants(self, variants, outfn):
print("\t".join(("id", "gene", "a", "d", "mu_r", "mu_v")), file=outf)
for variant in variants:
variant["ref_reads"] = ",".join([str(v) for v in variant["ref_reads"]])
- variant["total_reads"] = ",".join([str(v) for v in variant["total_reads"]])
+ variant["total_reads"] = ",".join(
+ [str(v) for v in variant["total_reads"]]
+ )
vals = (
"ssm_id",
"variant_name",
@@ -1190,7 +1250,10 @@ def _estimate_read_depth(self):
read_sum = 0
if len(self._variants) == 0:
default_read_depth = 50
- log("No variants available, so fixing read depth at %s." % default_read_depth)
+ log(
+ "No variants available, so fixing read depth at %s."
+ % default_read_depth
+ )
return default_read_depth
else:
return np.nanmedian(self._total_read_counts, axis=0)
@@ -1198,7 +1261,9 @@ def _estimate_read_depth(self):
def write_cnvs(self, variants, outfn):
with open(outfn, "w") as outf:
print("\t".join(("cnv", "a", "d", "ssms", "physical_cnvs")), file=outf)
- formatter = CnvFormatter(self._estimated_read_depth, self._sampidxs, self._hetsnp_rate)
+ formatter = CnvFormatter(
+ self._estimated_read_depth, self._sampidxs, self._hetsnp_rate
+ )
for cnv in formatter.format_and_merge_cnvs(
self._multisamp_cnv.load_single_abnormal_state_cnvs(),
variants,
@@ -1298,7 +1363,9 @@ def impute_missing_total_reads(total_reads, missing_variant_confidence):
assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0
# Convert 1D arrays to vectors to permit matrix multiplication.
- imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1)))
+ imputed_counts = np.dot(
+ variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1))
+ )
nan_coords = np.where(np.isnan(total_reads))
total_reads[nan_coords] = imputed_counts[nan_coords]
assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0
@@ -1331,7 +1398,9 @@ def is_good_chrom(chrom):
return False
-def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence):
+def parse_variants(
+ samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence
+):
parsed_variants = []
all_variant_ids = []
num_samples = len(samples)
@@ -1376,7 +1445,11 @@ def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_
)
)
else:
- variant_ids.append(VariantId(str(single_variant[0].CHROM), int(single_variant[0].POS), None))
+ variant_ids.append(
+ VariantId(
+ str(single_variant[0].CHROM), int(single_variant[0].POS), None
+ )
+ )
all_variant_ids += variant_ids
all_variant_ids = list(set(all_variant_ids)) # Eliminate duplicates.
@@ -1398,7 +1471,9 @@ def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_
ref_read_counts[variant_idx, sample_idx] = ref_reads
total_read_counts[variant_idx, sample_idx] = total_reads
- total_read_counts = impute_missing_total_reads(total_read_counts, missing_variant_confidence)
+ total_read_counts = impute_missing_total_reads(
+ total_read_counts, missing_variant_confidence
+ )
ref_read_counts = impute_missing_ref_reads(ref_read_counts, total_read_counts)
return (all_variant_ids, ref_read_counts, total_read_counts)
@@ -1411,7 +1486,9 @@ def infer_sex(variant_ids):
return "female"
-def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples):
+def extract_sample_data(
+ vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples
+):
vcf_files = {}
vcf_types = {}
cnv_files = {}
@@ -1426,7 +1503,9 @@ def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_
should_use_cnvs = cnv_files_and_samples is not None
if should_use_cnvs:
- assert len(cnv_files_and_samples) == len(vcf_files_and_samples), "Must specify same number of VCF and CNV files"
+ assert len(cnv_files_and_samples) == len(
+ vcf_files_and_samples
+ ), "Must specify same number of VCF and CNV files"
srcs_and_dsts.append((cnv_files_and_samples, cnv_files))
for src, dst in srcs_and_dsts:
@@ -1436,20 +1515,28 @@ def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_
dst[sample] = val
# Sample order will dictate eventual output order.
- common_samps = reduce(lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts])
+ common_samps = reduce(
+ lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts]
+ )
ordered_samps = [S.split("=", 1)[0] for S in vcf_files_and_samples]
assert len(ordered_samps) == len(common_samps) # Ensure no duplicates.
- assert set(vcf_files.keys()) == common_samps, "VCF file samples (%s) differ from common samples (%s)" % (
+ assert (
+ set(vcf_files.keys()) == common_samps
+ ), "VCF file samples (%s) differ from common samples (%s)" % (
vcf_files.keys(),
common_samps,
)
- assert set(vcf_types.keys()) == common_samps, "VCF type samples (%s) differ from common samples (%s)" % (
+ assert (
+ set(vcf_types.keys()) == common_samps
+ ), "VCF type samples (%s) differ from common samples (%s)" % (
vcf_types.keys(),
common_samps,
)
if should_use_cnvs:
- assert set(cnv_files.keys()) == common_samps, "CNV file samples (%s) differ from CNV file samples (%s)" % (
+ assert (
+ set(cnv_files.keys()) == common_samps
+ ), "CNV file samples (%s) differ from CNV file samples (%s)" % (
cnv_files.keys(),
common_samps,
)
@@ -1600,7 +1687,9 @@ def main():
log.verbose = args.verbose
params = {}
- samples, vcf_files, vcf_types, cnv_files = extract_sample_data(args.vcf_files, args.vcf_types, args.cnv_files)
+ samples, vcf_files, vcf_types, cnv_files = extract_sample_data(
+ args.vcf_files, args.vcf_types, args.cnv_files
+ )
params["samples"], params["vcf_files"], params["vcf_types"], params["cnv_files"] = (
samples,
vcf_files,
@@ -1634,7 +1723,9 @@ def main():
grouper.add_cnvs(cn_regions, sex)
if not grouper.has_cnvs():
- assert args.regions == "all", "If you do not provide CNA data, you must specify --regions=all"
+ assert (
+ args.regions == "all"
+ ), "If you do not provide CNA data, you must specify --regions=all"
if args.regions == "normal_cn":
grouper.retain_only_variants_in_normal_cn_regions()
@@ -1660,8 +1751,13 @@ def main():
if grouper.has_cnvs() and args.regions != "normal_cn":
# Write CNVs.
grouper.write_cnvs(subsampled_vars, args.output_cnvs)
- if args.output_nonsubsampled_variants and args.output_nonsubsampled_variants_cnvs:
- grouper.write_cnvs(nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs)
+ if (
+ args.output_nonsubsampled_variants
+ and args.output_nonsubsampled_variants_cnvs
+ ):
+ grouper.write_cnvs(
+ nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs
+ )
else:
# Write empty CNV file.
with open(args.output_cnvs, "w"):
diff --git a/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py b/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py
index e8ac2c3..e1fdbdc 100755
--- a/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py
+++ b/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py
@@ -131,7 +131,8 @@ def parse(self):
and str.isdigit(record["lcn.em"])
and str.isdigit(record["cf.em"].replace(".", "", 1))
):
- cnv["cellular_prevalence"] = float(record["cf.em"]) * self._cellularity
+ cnv["cellular_prevalence"] = (
+ float(record["cf.em"]))
cnv["minor_cn"] = int(record["lcn.em"])
cnv["major_cn"] = int(record["tcn.em"]) - cnv["minor_cn"]
chrom = record["chrom"]
@@ -181,7 +182,9 @@ def parse(self):
cnv1["end"] = end
cnv1["major_cn"] = int(fields[8 + self._field_offset])
cnv1["minor_cn"] = int(fields[9 + self._field_offset])
- cnv1["cellular_prevalence"] = float(fields[10 + self._field_offset]) * self._cellularity
+ cnv1["cellular_prevalence"] = (
+ float(fields[10 + self._field_offset]) * self._cellularity
+ )
cnv2 = None
# Stefan's comment on p values: The p-values correspond "to whether a
@@ -198,11 +201,15 @@ def parse(self):
cnv2["end"] = end
cnv2["major_cn"] = int(fields[11 + self._field_offset])
cnv2["minor_cn"] = int(fields[12 + self._field_offset])
- cnv2["cellular_prevalence"] = float(fields[13 + self._field_offset]) * self._cellularity
+ cnv2["cellular_prevalence"] = (
+ float(fields[13 + self._field_offset]) * self._cellularity
+ )
else:
cnv1["cellular_prevalence"] = self._cellularity
- if cnv1["start"] >= cnv1["end"] or (cnv2 is not None and cnv2["start"] >= cnv2["end"]):
+ if cnv1["start"] >= cnv1["end"] or (
+ cnv2 is not None and cnv2["start"] >= cnv2["end"]
+ ):
continue
cn_regions[chrom].append(cnv1)
@@ -258,7 +265,8 @@ def main():
if args.cellularity > 1.0:
print(
- "Cellularity for %s is %s. Setting to 1.0." % (args.cnv_file, args.cellularity),
+ "Cellularity for %s is %s. Setting to 1.0."
+ % (args.cnv_file, args.cellularity),
file=sys.stderr,
)
cellularity = 1.0
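[Editor's note: a sketch of the prevalence handling after this change, under the stated assumption that FACETS' cf.em already reflects tumor purity (so it is no longer multiplied by cellularity), while the other parsers still scale by a cellularity capped at 1.0; values here are hypothetical:

import sys

def clamp_cellularity(cellularity, cnv_file="sample.cnv"):
    # Mirrors the warning above: cellularity above 1.0 is capped at 1.0.
    if cellularity > 1.0:
        print("Cellularity for %s is %s. Setting to 1.0." % (cnv_file, cellularity), file=sys.stderr)
        return 1.0
    return cellularity

cellularity = clamp_cellularity(1.07)      # -> 1.0
facets_prevalence = float("0.85")          # cf.em used as-is after this change
other_prevalence = 0.6 * cellularity       # other formats still scaled by cellularity
]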
diff --git a/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap
index 58351ec..7d9d8ad 100644
--- a/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap
+++ b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap
@@ -8,7 +8,7 @@
"id": "test",
"single_end": false
},
- "cnvs.txt:md5,5ba8c1c06170c6d7ee9c18026b710154"
+ "cnvs.txt:md5,2f7b4d8ceff8a9505c4b69e2b1e9ccc7"
]
],
"1": [
@@ -20,7 +20,7 @@
"id": "test",
"single_end": false
},
- "cnvs.txt:md5,5ba8c1c06170c6d7ee9c18026b710154"
+ "cnvs.txt:md5,2f7b4d8ceff8a9505c4b69e2b1e9ccc7"
]
],
"versions": [
@@ -28,7 +28,7 @@
]
}
],
- "timestamp": "2024-06-11T18:02:28.983981"
+ "timestamp": "2024-07-08T12:15:00.257251"
},
"phylowgs_parsecnvs - txt - stub": {
"content": [
@@ -59,6 +59,6 @@
]
}
],
- "timestamp": "2024-06-11T18:02:36.26591"
+ "timestamp": "2024-07-08T12:15:19.111149"
}
}
\ No newline at end of file
diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml
deleted file mode 100644
index 1787b38..0000000
--- a/modules/nf-core/fastqc/environment.yml
+++ /dev/null
@@ -1,7 +0,0 @@
-name: fastqc
-channels:
- - conda-forge
- - bioconda
- - defaults
-dependencies:
- - bioconda::fastqc=0.12.1
diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf
deleted file mode 100644
index d79f1c8..0000000
--- a/modules/nf-core/fastqc/main.nf
+++ /dev/null
@@ -1,61 +0,0 @@
-process FASTQC {
- tag "$meta.id"
- label 'process_medium'
-
- conda "${moduleDir}/environment.yml"
- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' :
- 'biocontainers/fastqc:0.12.1--hdfd78af_0' }"
-
- input:
- tuple val(meta), path(reads)
-
- output:
- tuple val(meta), path("*.html"), emit: html
- tuple val(meta), path("*.zip") , emit: zip
- path "versions.yml" , emit: versions
-
- when:
- task.ext.when == null || task.ext.when
-
- script:
- def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: "${meta.id}"
- // Make list of old name and new name pairs to use for renaming in the bash while loop
- def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] }
- def rename_to = old_new_pairs*.join(' ').join(' ')
- def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ')
-
- def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB')
- // FastQC memory value allowed range (100 - 10000)
- def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 100 : memory_in_mb)
-
- """
- printf "%s %s\\n" $rename_to | while read old_name new_name; do
- [ -f "\${new_name}" ] || ln -s \$old_name \$new_name
- done
-
- fastqc \\
- $args \\
- --threads $task.cpus \\
- --memory $fastqc_memory \\
- $renamed_files
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' )
- END_VERSIONS
- """
-
- stub:
- def prefix = task.ext.prefix ?: "${meta.id}"
- """
- touch ${prefix}.html
- touch ${prefix}.zip
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' )
- END_VERSIONS
- """
-}
diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml
deleted file mode 100644
index ee5507e..0000000
--- a/modules/nf-core/fastqc/meta.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: fastqc
-description: Run FastQC on sequenced reads
-keywords:
- - quality control
- - qc
- - adapters
- - fastq
-tools:
- - fastqc:
- description: |
- FastQC gives general quality metrics about your reads.
- It provides information about the quality score distribution
- across your reads, the per base sequence content (%A/C/G/T).
- You get information about adapter contamination and other
- overrepresented sequences.
- homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
- documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/
- licence: ["GPL-2.0-only"]
-input:
- - meta:
- type: map
- description: |
- Groovy Map containing sample information
- e.g. [ id:'test', single_end:false ]
- - reads:
- type: file
- description: |
- List of input FastQ files of size 1 and 2 for single-end and paired-end data,
- respectively.
-output:
- - meta:
- type: map
- description: |
- Groovy Map containing sample information
- e.g. [ id:'test', single_end:false ]
- - html:
- type: file
- description: FastQC report
- pattern: "*_{fastqc.html}"
- - zip:
- type: file
- description: FastQC report archive
- pattern: "*_{fastqc.zip}"
- - versions:
- type: file
- description: File containing software versions
- pattern: "versions.yml"
-authors:
- - "@drpatelh"
- - "@grst"
- - "@ewels"
- - "@FelixKrueger"
-maintainers:
- - "@drpatelh"
- - "@grst"
- - "@ewels"
- - "@FelixKrueger"
diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test
deleted file mode 100644
index 70edae4..0000000
--- a/modules/nf-core/fastqc/tests/main.nf.test
+++ /dev/null
@@ -1,212 +0,0 @@
-nextflow_process {
-
- name "Test Process FASTQC"
- script "../main.nf"
- process "FASTQC"
-
- tag "modules"
- tag "modules_nfcore"
- tag "fastqc"
-
- test("sarscov2 single-end [fastq]") {
-
- when {
- process {
- """
- input[0] = Channel.of([
- [ id: 'test', single_end:true ],
- [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
-
- // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it.
- // looks like this:
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039
-
- { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" },
- { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" },
-            { assert path(process.out.html[0][1]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-
- { assert snapshot(process.out.versions).match("fastqc_versions_single") }
- )
- }
- }
-
- test("sarscov2 paired-end [fastq]") {
-
- when {
- process {
- """
- input[0] = Channel.of([
- [id: 'test', single_end: false], // meta map
- [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ]
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
-
- { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" },
- { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" },
- { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" },
- { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" },
-            { assert path(process.out.html[0][1][0]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-            { assert path(process.out.html[0][1][1]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-
- { assert snapshot(process.out.versions).match("fastqc_versions_paired") }
- )
- }
- }
-
- test("sarscov2 interleaved [fastq]") {
-
- when {
- process {
- """
- input[0] = Channel.of([
- [id: 'test', single_end: false], // meta map
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true)
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
-
- { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" },
- { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" },
-            { assert path(process.out.html[0][1]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-
- { assert snapshot(process.out.versions).match("fastqc_versions_interleaved") }
- )
- }
- }
-
- test("sarscov2 paired-end [bam]") {
-
- when {
- process {
- """
- input[0] = Channel.of([
- [id: 'test', single_end: false], // meta map
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true)
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
-
- { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" },
- { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" },
-            { assert path(process.out.html[0][1]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-
- { assert snapshot(process.out.versions).match("fastqc_versions_bam") }
- )
- }
- }
-
- test("sarscov2 multiple [fastq]") {
-
- when {
- process {
- """
- input[0] = Channel.of([
- [id: 'test', single_end: false], // meta map
- [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true),
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true),
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ]
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
-
- { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" },
- { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" },
- { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" },
- { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" },
- { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" },
- { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" },
- { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" },
- { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" },
-            { assert path(process.out.html[0][1][0]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-            { assert path(process.out.html[0][1][1]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-            { assert path(process.out.html[0][1][2]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-            { assert path(process.out.html[0][1][3]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-
- { assert snapshot(process.out.versions).match("fastqc_versions_multiple") }
- )
- }
- }
-
- test("sarscov2 custom_prefix") {
-
- when {
- process {
- """
- input[0] = Channel.of([
- [ id:'mysample', single_end:true ], // meta map
- file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
-
- { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" },
- { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" },
-            { assert path(process.out.html[0][1]).text.contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-
- { assert snapshot(process.out.versions).match("fastqc_versions_custom_prefix") }
- )
- }
- }
-
- test("sarscov2 single-end [fastq] - stub") {
-
- options "-stub"
-
- when {
- process {
- """
- input[0] = Channel.of([
- [ id: 'test', single_end:true ],
- [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]
- ])
- """
- }
- }
-
- then {
- assertAll (
- { assert process.success },
- { assert snapshot(process.out.html.collect { file(it[1]).getName() } +
- process.out.zip.collect { file(it[1]).getName() } +
- process.out.versions ).match("fastqc_stub") }
- )
- }
- }
-
-}
diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap
deleted file mode 100644
index 86f7c31..0000000
--- a/modules/nf-core/fastqc/tests/main.nf.test.snap
+++ /dev/null
@@ -1,88 +0,0 @@
-{
- "fastqc_versions_interleaved": {
- "content": [
- [
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:40:07.293713"
- },
- "fastqc_stub": {
- "content": [
- [
- "test.html",
- "test.zip",
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:31:01.425198"
- },
- "fastqc_versions_multiple": {
- "content": [
- [
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:40:55.797907"
- },
- "fastqc_versions_bam": {
- "content": [
- [
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:40:26.795862"
- },
- "fastqc_versions_single": {
- "content": [
- [
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:39:27.043675"
- },
- "fastqc_versions_paired": {
- "content": [
- [
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:39:47.584191"
- },
- "fastqc_versions_custom_prefix": {
- "content": [
- [
- "versions.yml:md5,e1cc25ca8af856014824abd842e93978"
- ]
- ],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "23.10.1"
- },
- "timestamp": "2024-01-31T17:41:14.576531"
- }
-}
\ No newline at end of file
diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml
deleted file mode 100644
index 7834294..0000000
--- a/modules/nf-core/fastqc/tests/tags.yml
+++ /dev/null
@@ -1,2 +0,0 @@
-fastqc:
- - modules/nf-core/fastqc/**
diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml
index ecb7dd7..2121492 100644
--- a/modules/nf-core/multiqc/environment.yml
+++ b/modules/nf-core/multiqc/environment.yml
@@ -4,4 +4,4 @@ channels:
- bioconda
- defaults
dependencies:
- - bioconda::multiqc=1.22.3
+ - bioconda::multiqc=1.23
diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
index 2581a49..459dfea 100644
--- a/modules/nf-core/multiqc/main.nf
+++ b/modules/nf-core/multiqc/main.nf
@@ -3,14 +3,16 @@ process MULTIQC {
conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/multiqc:1.22.3--pyhdfd78af_0' :
- 'biocontainers/multiqc:1.22.3--pyhdfd78af_0' }"
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.23--pyhdfd78af_0' :
+ 'biocontainers/multiqc:1.23--pyhdfd78af_0' }"
input:
path multiqc_files, stageAs: "?/*"
path(multiqc_config)
path(extra_multiqc_config)
path(multiqc_logo)
+ path(replace_names)
+ path(sample_names)
output:
path "*multiqc_report.html", emit: report
@@ -26,6 +28,8 @@ process MULTIQC {
def config = multiqc_config ? "--config $multiqc_config" : ''
def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : ''
def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : ''
+ def replace = replace_names ? "--replace-names ${replace_names}" : ''
+ def samples = sample_names ? "--sample-names ${sample_names}" : ''
"""
multiqc \\
--force \\
@@ -33,6 +37,8 @@ process MULTIQC {
$config \\
$extra_config \\
$logo \\
+ $replace \\
+ $samples \\
.
cat <<-END_VERSIONS > versions.yml
diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml
index 45a9bc3..382c08c 100644
--- a/modules/nf-core/multiqc/meta.yml
+++ b/modules/nf-core/multiqc/meta.yml
@@ -29,6 +29,19 @@ input:
type: file
description: Optional logo file for MultiQC
pattern: "*.{png}"
+ - replace_names:
+ type: file
+ description: |
+ Optional two-column sample renaming file: the first column lists
+ patterns to match, the second the corresponding replacements. Passed
+ via MultiQC's `--replace-names` option.
+ pattern: "*.{tsv}"
+ - sample_names:
+ type: file
+ description: |
+ Optional TSV file with headers, passed to MultiQC's --sample-names
+ argument.
+ pattern: "*.{tsv}"
output:
- report:
type: file
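[Editor's note: an illustrative sketch of the two new optional inputs and how the module's optional-flag pattern expands into a command line; the file names and TSV contents below are hypothetical, not taken from the pipeline:

replace_names = "replace_names.tsv"  # pattern<TAB>replacement per row, no header
sample_names = "sample_names.tsv"    # TSV with a header row

with open(replace_names, "w") as fh:
    fh.write("SAMPLE_P1_L001\tpatient_1\n")
with open(sample_names, "w") as fh:
    fh.write("Original Name\tDisplay Name\npatient_1\tPatient 1\n")

# Same pattern as the module: empty string when the optional input is absent.
replace = "--replace-names %s" % replace_names if replace_names else ""
samples = "--sample-names %s" % sample_names if sample_names else ""
print(" ".join(filter(None, ["multiqc", "--force", replace, samples, "."])))
]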
diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test
index f1c4242..6aa27f4 100644
--- a/modules/nf-core/multiqc/tests/main.nf.test
+++ b/modules/nf-core/multiqc/tests/main.nf.test
@@ -17,6 +17,8 @@ nextflow_process {
input[1] = []
input[2] = []
input[3] = []
+ input[4] = []
+ input[5] = []
"""
}
}
@@ -41,6 +43,8 @@ nextflow_process {
input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true))
input[2] = []
input[3] = []
+ input[4] = []
+ input[5] = []
"""
}
}
@@ -66,6 +70,8 @@ nextflow_process {
input[1] = []
input[2] = []
input[3] = []
+ input[4] = []
+ input[5] = []
"""
}
}
diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap
index 0a4760e..45e95e5 100644
--- a/modules/nf-core/multiqc/tests/main.nf.test.snap
+++ b/modules/nf-core/multiqc/tests/main.nf.test.snap
@@ -2,14 +2,14 @@
"multiqc_versions_single": {
"content": [
[
- "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984"
+ "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19"
]
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
- "timestamp": "2024-06-25T12:31:21.878452033"
+ "timestamp": "2024-07-10T12:41:34.562023"
},
"multiqc_stub": {
"content": [
@@ -17,25 +17,25 @@
"multiqc_report.html",
"multiqc_data",
"multiqc_plots",
- "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984"
+ "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19"
]
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
- "timestamp": "2024-06-25T12:32:02.322196503"
+ "timestamp": "2024-07-10T11:27:11.933869532"
},
"multiqc_versions_config": {
"content": [
[
- "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984"
+ "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19"
]
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
- "timestamp": "2024-06-25T12:31:50.064227638"
+ "timestamp": "2024-07-10T11:26:56.709849369"
}
}
\ No newline at end of file
diff --git a/subworkflows/msk/netmhcstabandpan/main.nf b/subworkflows/msk/netmhcstabandpan/main.nf
index d0425ef..ea8247c 100644
--- a/subworkflows/msk/netmhcstabandpan/main.nf
+++ b/subworkflows/msk/netmhcstabandpan/main.nf
@@ -91,13 +91,5 @@ def createNETMHCInput(wt_fasta, mut_fasta, hla) {
new Tuple(it[1][0], it[1][1],it[2][1],"WT")
}
merged = merged_mut.mix(merged_wt)
-
-
-// merged = merged_mut
-// .join(merged_wt)
-// .map{
-// new Tuple(it[1][0],it[1][1],it[1][2],it[1][3])
-// }
-
return merged
}
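[Editor's note: a Python analogue (not the Nextflow code itself) of the simplified merge above: MUT and WT records are each tagged and concatenated, rather than joined pairwise as in the now-removed commented-out variant; tuple contents are hypothetical placeholders:

mut_records = [("sample1", "hla_alleles.txt", "sample1.MUT_sequences.fa")]
wt_records = [("sample1", "hla_alleles.txt", "sample1.WT_sequences.fa")]

merged = [(s, hla, fa, "MUT") for (s, hla, fa) in mut_records] + [
    (s, hla, fa, "WT") for (s, hla, fa) in wt_records
]
# Each tuple carries an explicit MUT/WT label, mirroring merged_mut.mix(merged_wt).
print(merged)
]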
diff --git a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap
index 0d3ce18..dbde65e 100644
--- a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap
+++ b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap
@@ -6,26 +6,15 @@
"test.MUT_sequences.fa:md5,7fdb7d3f0fe5a6f439ed294b612c2d70",
"test.WT_sequences.fa:md5,7595ed6cf0c98500b00c9ad027125b38"
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-11T16:27:38.189273"
+ "timestamp": "2024-07-30T13:48:55.729458"
},
"netmhcstabandpan - tsv,xls,fa - stub": {
"content": [
- [
- "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
- "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
- ],
+ "test.WT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
"test.MUT.xls",
"test.MUT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e",
"test.WT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e"
],
- "meta": {
- "nf-test": "0.8.4",
- "nextflow": "24.04.2"
- },
- "timestamp": "2024-06-11T16:27:49.026711"
+ "timestamp": "2024-07-30T13:49:11.413783"
}
}
\ No newline at end of file