diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml
index aa0e6b2..08622fd 100644
--- a/.github/workflows/download_pipeline.yml
+++ b/.github/workflows/download_pipeline.yml
@@ -11,14 +11,14 @@ on:
        description: "The specific branch you wish to utilize for the test execution of nf-core download."
        required: true
        default: "dev"
-#  pull_request:
-#    types:
-#      - opened
-#    branches:
-#      - master
-#  pull_request_target:
-#    branches:
-#      - master
+  pull_request:
+    types:
+      - opened
+    branches:
+      - master
+  pull_request_target:
+    branches:
+      - master

 env:
   NXF_ANSI_LOG: false
diff --git a/docs/images/luksza2021_fig3.png b/docs/images/luksza2021_fig3.png
new file mode 100644
index 0000000..0c87548
Binary files /dev/null and b/docs/images/luksza2021_fig3.png differ
diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png
deleted file mode 100755
index 361d0e4..0000000
Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ
diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png
deleted file mode 100755
index cb39ebb..0000000
Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ
diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png
deleted file mode 100755
index a4b89bf..0000000
Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ
diff --git a/docs/output.md b/docs/output.md
index 7580a0e..012467e 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -12,48 +12,95 @@ The directories listed below will be created in the results directory after the
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+1. Create phylogenetic trees using [PhyloWGS](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8)
+2. Use [netMHCpan-4.1](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) to calculate binding affinities
+3. Use [netMHCstabpan](https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/) to calculate stability scores
+4. Use Luksza et al.'s neoantigen quality and fitness computation tool ([NeoantigenEditing](https://github.com/LukszaLab/NeoantigenEditing)) to evaluate peptides

-### FastQC
+### PhyloWGS

<details markdown="1">
Output files

-- `fastqc/`
-  - `*_fastqc.html`: FastQC report containing quality metrics.
-  - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+- `phylowgs/`
+  - `*.summ.json.gz`: Output file for JSON-formatted tree summaries
+  - `*.muts.json.gz`: Output file for JSON-formatted list of mutations
+  - `*.mutass.zip`: Output zipped folder of JSON-formatted SSM and CNV assignments per tree

</details>
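+A quick way to sanity-check these outputs outside the pipeline is to load the tree summaries directly. Below is a minimal sketch, assuming the standard PhyloWGS summary layout (a top-level `trees` map keyed by tree index, each tree carrying an `llh` log-likelihood field); the file name is illustrative:
+
+```python
+import gzip
+import json
+
+# Load the PhyloWGS tree summaries and report the highest-likelihood tree.
+with gzip.open("patient.summ.json.gz", "rt") as fh:
+    summary = json.load(fh)
+
+# "trees" and "llh" are assumed from the standard PhyloWGS summary schema.
+best_idx, best_tree = max(summary["trees"].items(), key=lambda kv: kv[1]["llh"])
+print(f"Best tree {best_idx}: log-likelihood {best_tree['llh']:.2f}")
+```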
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +### netMHCpan -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) - -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +
+
Output files

-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+- `netmhcpan/`
+  - `*.xls`: TSV/XLS file from netMHCpan. This contains the MUT or WT antigens
+  - `*.WT.netmhcpan.output,*.MUT.netmhcpan.output`: STDOUT file of netMHCpan. A uniquely formatted file of neoantigens, containing either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file (a parsing sketch follows below).

-:::note
-The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
-:::

</details>
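+Despite the `.xls` extension, the netMHCpan "xls" output is plain tab-separated text. A minimal parsing sketch follows; the two-row header and the `EL_Rank` column name are assumptions based on typical netMHCpan-4.1 output and should be checked against your own files:
+
+```python
+import pandas as pd
+
+# netMHCpan usually writes an HLA-allele row above the real column header,
+# hence skiprows=1; adjust if your output differs.
+df = pd.read_csv("sample.MUT.xls", sep="\t", skiprows=1)
+
+# Keep likely binders by eluted-ligand percentile rank (column name assumed).
+strong = df[df["EL_Rank"] < 2.0]
+print(strong[["Peptide", "ID", "EL_Rank"]].head())
+```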
-### MultiQC +### netMHCstabpan
Output files

-- `multiqc/`
-  - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
-  - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
-  - `multiqc_plots/`: directory containing static images from the report in various formats.
+- `netmhcstabpan/`
+  - `*.xls`: TSV/XLS file from netMHCstabpan. This contains the MUT or WT antigens
+  - `*.WT.netmhcstabpan.output,*.MUT.netmhcstabpan.output`: STDOUT file of netMHCstabpan. A uniquely formatted file of neoantigens, containing either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file (a hand-run sketch follows below).

</details>
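+The raw STDOUT files are not tabular; the pipeline converts them with the neoantigenutils parser. A hand-run sketch of that step, with flag names taken from `format_netmhcpan_output.py`'s `parse_args` (shown further down in this diff); the file names are illustrative:
+
+```python
+import subprocess
+
+# Reformat a netMHCstabpan STDOUT file into the TSV used downstream.
+subprocess.run(
+    [
+        "format_netmhcpan_output.py",
+        "--netMHCpan_output", "sample.MUT.netmhcstabpan.output",
+        "--type_MUT",    # the file holds MUT (not WT) peptides
+        "--from_STAB",   # the file came from netmhcstab
+        "--id", "sample",
+    ],
+    check=True,
+)
+```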
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.

+### Neoantigen Editing Final Output
+
+<details markdown="1">
+
Output files

+- `neoantigenediting/`
+
+  - `*_annotated.json`: The final output of the pipeline. This file is an annotated version of the tree output from PhyloWGS, with an extra property titled 'neoantigens'. Each entry in 'neoantigens' describes one neoantigen; its properties are listed below.

-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .

+    An example entry from a run (each neoantigenic mutation will have an entry like this):
+
+        "id": "XSYI_MG_M_9_C1203_11",
+        "mutation_id": "X_72667534_C_G",
+        "HLA_gene_id": "HLA-C*12:03",
+        "sequence": "ASRSRHSPY",
+        "WT_sequence": "PSRSRHSPY",
+        "mutated_position": 1,
+        "Kd": 192.03,
+        "KdWT": 4582.17,
+        "R": 0.8911371281207195,
+        "logC": 2.263955023939215,
+        "logA": 3.1722763542054815,
+        "quality": 2.645601185190205
+
+    - id: A unique id combining ids derived from the mutation, the HLA allele, and the peptide window.
+    - mutation_id: ID containing the chromosome, position, and the ref and alt alleles. I and D denote insertions and deletions, respectively.
+    - HLA_gene_id: The HLA gene this neoantigen binds to.
+    - sequence: The mutated peptide sequence.
+    - WT_sequence: The wild-type peptide sequence.
+    - mutated_position: The position of the first difference between the mutated and wild-type sequences.
+    - Kd: Binding affinity in nM from netMHCpan for the mutated peptide.
+    - KdWT: Binding affinity in nM from netMHCpan for the wild-type peptide.
+    - R: Similarity of the mutated peptide to IEDB peptides.
+    - logC: The log of the cross-reactivity.
+    - logA: The log of the amplitude, a function of Kd/KdWT and a constant.
+    - quality: The final output of the pipeline and of neoantigen editing; a higher quality indicates a better neoantigen. Quality is described in the Luksza et al. paper and is visualized below.

</details>
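+Since quality is the headline metric, a natural first step is to rank the annotated neoantigens by it. A minimal sketch, assuming the field names shown above (with the `neoantigens` list at the top level of the annotated tree JSON); the file name is illustrative:
+
+```python
+import json
+
+with open("patient_annotated.json") as fh:
+    tree = json.load(fh)
+
+# Sort neoantigens by descending quality (higher = better neoantigen).
+ranked = sorted(tree["neoantigens"], key=lambda n: n["quality"], reverse=True)
+for neo in ranked[:5]:
+    print(f"{neo['id']}  quality={neo['quality']:.3f}  "
+          f"Kd={neo['Kd']:.1f} nM  KdWT={neo['KdWT']:.1f} nM")
+```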
### Pipeline information diff --git a/modules.json b/modules.json index 1a9db25..bf1a62f 100644 --- a/modules.json +++ b/modules.json @@ -6,27 +6,27 @@ "modules": { "msk": { "neoantigenediting/aligntoiedb": { - "branch": "neoantigen", - "git_sha": "14ae2b3db25701828835e7145cee14dfdaddf180", + "branch": "develop", + "git_sha": "cac9c047e374ee259fb612ba5816e7e6aae6b86f", "installed_by": ["neoantigen_editing"] }, "neoantigenediting/computefitness": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "1f65c2ecdc5010549055ff7f4e6b8bccee48d4ae", "installed_by": ["neoantigen_editing"] }, "neoantigenutils/formatnetmhcpan": { - "branch": "neoantigen", - "git_sha": "fceccd3f96fec678849bb9bc0c04e53d9965f973", + "branch": "develop", + "git_sha": "c5d1252252e15555abcc82ea537cebeb281a1856", "installed_by": ["netmhcstabandpan"] }, "neoantigenutils/generatehlastring": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970", "installed_by": ["netmhcstabandpan"] }, "neoantigenutils/generatemutfasta": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "bb7975c796ab9a2d7a45ef733a6a226a0f5ad74a", "installed_by": ["netmhcstabandpan"] }, @@ -36,32 +36,32 @@ "installed_by": ["modules"] }, "netmhcpan": { - "branch": "neoantigen", - "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970", + "branch": "develop", + "git_sha": "503abeb67260f060d8228221b07d743aa4180345", "installed_by": ["modules", "netmhcstabandpan"] }, "netmhcstabpan": { - "branch": "neoantigen", - "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970", + "branch": "develop", + "git_sha": "c1a473f8bc08f778269a36ab62d5adf24357225f", "installed_by": ["modules", "netmhcstabandpan"] }, "phylowgs/createinput": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "b031249dcf4279606c25e626da2a628756e75e8a", "installed_by": ["phylowgs"] }, "phylowgs/multievolve": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "535662d391a3533dea3b11c462c14799227e08b2", "installed_by": ["phylowgs"] }, "phylowgs/parsecnvs": { - "branch": "neoantigen", - "git_sha": "064ce5b42a8f711fb4d8107150aad2d382ae99c2", + "branch": "develop", + "git_sha": "8471691d7c29bc2f5f4fb92279c94fb2640b6c38", "installed_by": ["phylowgs"] }, "phylowgs/writeresults": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "6d27f08bf649e8680ace321d3127dcdf0e210973", "installed_by": ["phylowgs"] } @@ -70,17 +70,17 @@ "subworkflows": { "msk": { "neoantigen_editing": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "56a628201401866096d6307b9e8c690c5eb46ac2", "installed_by": ["subworkflows"] }, "netmhcstabandpan": { - "branch": "neoantigen", - "git_sha": "1b7ac020798572be26402a72dd9c1a22ce849a63", + "branch": "develop", + "git_sha": "d60211568e3709e9284bc06eef938e361d474d08", "installed_by": ["subworkflows"] }, "phylowgs": { - "branch": "neoantigen", + "branch": "develop", "git_sha": "a5d61394af346f21ee2eb7ecfd97ab25bdbd1d0e", "installed_by": ["subworkflows"] } @@ -90,14 +90,9 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { - "fastqc": { - "branch": "master", - "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": ["modules"] - }, "multiqc": { "branch": "master", - "git_sha": "314d742bdb357a1df5f9b88427b3b6ac78aa33f7", + "git_sha": "b80f5fd12ff7c43938f424dd76392a2704fa2396", "installed_by": ["modules"] } } diff --git a/modules/msk/neoantigenediting/aligntoiedb/meta.yml b/modules/msk/neoantigenediting/aligntoiedb/meta.yml index 7a097b3..77d6121 100644 --- 
a/modules/msk/neoantigenediting/aligntoiedb/meta.yml +++ b/modules/msk/neoantigenediting/aligntoiedb/meta.yml @@ -1,7 +1,7 @@ --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "neoantigenediting_aligntoiedb" -description: Align neoantigens to the IEDB resource +description: Align neoantigens to the IEDB file keywords: - neoantigenediting - neoantigens diff --git a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py index 20886ca..9ff645a 100755 --- a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py +++ b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py @@ -47,14 +47,20 @@ def load_blosum62_mat(): * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 """ amino_acids = "ACDEFGHIKLMNPQRSTVWY" - blosum62_mat_str_list = [l.split() for l in raw_blosum62_mat_str.strip().split("\n")] + blosum62_mat_str_list = [ + l.split() for l in raw_blosum62_mat_str.strip().split("\n") + ] blosum_aa_order = [blosum62_mat_str_list[0].index(aa) for aa in amino_acids] blosum62_mat = np.zeros((len(amino_acids), len(amino_acids))) for i, bl_ind in enumerate(blosum_aa_order): - blosum62_mat[i] = np.array([int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]])[blosum_aa_order] + blosum62_mat[i] = np.array( + [int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]] + )[blosum_aa_order] blosum62 = { - (aaA, aaB): blosum62_mat[i, j] for i, aaA in enumerate(amino_acids) for j, aaB in enumerate(amino_acids) + (aaA, aaB): blosum62_mat[i, j] + for i, aaA in enumerate(amino_acids) + for j, aaB in enumerate(amino_acids) } return blosum62 @@ -226,7 +232,9 @@ def load_epitopes(iedbfasta): pjson = json.load(f) patient = pjson["patient"] neoantigens = pjson["neoantigens"] - peptides = set([("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens]) + peptides = set( + [("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens] + ) pepseq2pepid = defaultdict(set) for pep_id, pep_seq in peptides: pepseq2pepid[pep_seq].add(pep_id) @@ -251,5 +259,7 @@ def load_epitopes(iedbfasta): "Alignment_score", ] else: - aln_data = pd.DataFrame(columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"]) + aln_data = pd.DataFrame( + columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"] + ) aln_data.to_csv("iedb_alignments_" + patient + ".txt", sep="\t", index=False) diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py index 03d78c4..f278817 100755 --- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py +++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py @@ -11,7 +11,6 @@ import os -# % class EpitopeDistance(object): """Base class for epitope crossreactivity. 
@@ -39,7 +38,9 @@ class EpitopeDistance(object): def __init__( self, - model_file=os.path.join(os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json"), + model_file=os.path.join( + os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json" + ), amino_acids="ACDEFGHIKLMNPQRSTVWY", ): """Initialize class and compute M_ab.""" @@ -75,5 +76,11 @@ def epitope_dist(self, epiA, epiB): """ return sum( - [self.d_i[i] * self.M_ab[self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]]] for i in range(9)] + [ + self.d_i[i] + * self.M_ab[ + self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]] + ] + for i in range(9) + ] ) diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py index bc8cd54..da19f4d 100755 --- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py +++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py @@ -60,7 +60,9 @@ def fill_up_clone_neoantigens(tree, mut2neo): while len(nodes) > 0: node = nodes[0] nodes = nodes[1:] - node["neoantigens"] = [neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid]] + node["neoantigens"] = [ + neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid] + ] node["neoantigen_load"] = len(node["neoantigens"]) node["NA_Mut"] = sum([len(mut2neo[mid]) > 0 for mid in node["all_mutations"]]) if "children" in node: @@ -157,7 +159,9 @@ def compute_effective_sample_size(sample_json): for clone_muts, X in zip(clone_muts_list, freqs): for mid in clone_muts: mut_freqs[mid].append(X) - avev = np.mean([np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs]) + avev = np.mean( + [np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs] + ) n = 1 / avev return n diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf index 793432e..d6d67d1 100644 --- a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf +++ b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf @@ -36,10 +36,10 @@ process NEOANTIGENUTILS_FORMATNETMHCPAN { stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def netmhcOutputType = meta.typeMut ? "MUT": "WT" + def netmhcOutputFrom = meta.fromStab ? 
"STAB": "PAN" """ - - touch ${prefix}.MUT.tsv - touch ${prefix}.WT.tsv + touch ${prefix}.${netmhcOutputType}.${netmhcOutputFrom}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": formatNetmhcpanOutput: \$(echo \$(format_netmhcpan_output.py -v)) diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py index 3b81dcb..a11be09 100755 --- a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py +++ b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py @@ -71,7 +71,9 @@ def netMHCpan_out_reformat(netMHCpanoutput, mut, stab, prefix): def parse_args(): parser = argparse.ArgumentParser(description="Process input files and parameters") - parser.add_argument("--netMHCpan_output", required=True, help="Path to netMHC output") + parser.add_argument( + "--netMHCpan_output", required=True, help="Path to netMHC output" + ) parser.add_argument("--type_MUT", action="store_true", help="Output is a MUT type") parser.add_argument( "--from_STAB", @@ -79,13 +81,17 @@ def parse_args(): help="Output is from netmhcstab", ) parser.add_argument("--id", required=True, help="Prefix to label the output") - parser.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION)) + parser.add_argument( + "-v", "--version", action="version", version="%(prog)s {}".format(VERSION) + ) return parser.parse_args() def main(args): - netMHCpan_out_reformat(args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id) + netMHCpan_out_reformat( + args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id + ) if __name__ == "__main__": diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap index 28d29b9..685825a 100644 --- a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap +++ b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap @@ -30,11 +30,7 @@ ] } ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-13T12:57:40.919561" + "timestamp": "2024-07-30T13:46:27.878268" }, "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv - stub": { "content": [ @@ -46,10 +42,7 @@ "typeMut": true, "fromStab": false }, - [ - "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "1": [ @@ -62,10 +55,7 @@ "typeMut": true, "fromStab": false }, - [ - "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "versions": [ @@ -73,11 +63,7 @@ ] } ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-13T12:58:01.264912" + "timestamp": "2024-07-30T13:47:05.72509" }, "neoantigenutils_formatnetmhcpan - output(WT,netmhcpan) - tsv": { "content": [ @@ -110,11 +96,7 @@ ] } ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-13T12:57:46.381516" + "timestamp": "2024-07-30T13:46:37.183992" }, "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpanstab) - tsv": { "content": [ @@ -147,11 +129,7 @@ ] } ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-13T12:57:51.691253" + "timestamp": "2024-07-30T13:46:47.110076" }, 
"neoantigenutils_formatnetmhcpan - output(WT,netmhcpanstab) - tsv": { "content": [ @@ -184,10 +162,6 @@ ] } ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-13T12:57:56.882818" + "timestamp": "2024-07-30T13:46:56.841519" } } \ No newline at end of file diff --git a/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py b/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py index e09ab78..84d23fc 100755 --- a/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py +++ b/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py @@ -40,10 +40,18 @@ def main(): help="sample_id used to limit neoantigen prediction to identify mutations " "associated with the patient in the MAF (column 16). ", ) - required_arguments.add_argument("--output_dir", required=True, help="output directory") - required_arguments.add_argument("--maf_file", required=True, help="expects a CMO maf file (post vcf2maf.pl)") - required_arguments.add_argument("--CDS_file", required=True, help="expects a fa.gz file") - required_arguments.add_argument("--CDNA_file", required=True, help="expects a fa.gz file") + required_arguments.add_argument( + "--output_dir", required=True, help="output directory" + ) + required_arguments.add_argument( + "--maf_file", required=True, help="expects a CMO maf file (post vcf2maf.pl)" + ) + required_arguments.add_argument( + "--CDS_file", required=True, help="expects a fa.gz file" + ) + required_arguments.add_argument( + "--CDNA_file", required=True, help="expects a fa.gz file" + ) optional_arguments = parser.add_argument_group("Optional arguments") optional_arguments.add_argument( @@ -51,7 +59,9 @@ def main(): required=False, help="comma-separated numbers indicating the lengths of peptides to generate. 
Default: 9,10", ) - optional_arguments.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION)) + optional_arguments.add_argument( + "-v", "--version", action="version", version="%(prog)s {}".format(VERSION) + ) args = parser.parse_args() @@ -74,7 +84,9 @@ def main(): # logger = logging.getLogger("neoantigen") logger.setLevel(logging.DEBUG) - console_formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S") + console_formatter = logging.Formatter( + "%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S" + ) # logfile handler handler_file = logging.FileHandler(output_dir + "/neoantigen_run.log", mode="w") @@ -103,7 +115,9 @@ def main(): logger.info("Finished loading reference CDS/cDNA sequences...") logger.info("Reading MAF file and constructing mutated peptides...") - maf_df = skip_lines_start_with(maf_file, "#", low_memory=False, header=0, sep="\t") + maf_df = skip_lines_start_with( + maf_file, "#", low_memory=False, header=0, sep="\t" + ) n_muts = n_non_syn_muts = n_missing_tx_id = 0 for index, row in maf_df.iterrows(): cds_seq = "" @@ -148,7 +162,13 @@ def main(): logger.info("\tMAF mutations summary") logger.info("\t\t# mutations: " + str(n_muts)) - logger.info("\t\t# non-syn: " + str(n_non_syn_muts) + " (# with missing CDS: " + str(n_missing_tx_id) + ")") + logger.info( + "\t\t# non-syn: " + + str(n_non_syn_muts) + + " (# with missing CDS: " + + str(n_missing_tx_id) + + ")" + ) except Exception: logger.error("Error while generating mutated peptides") @@ -263,12 +283,18 @@ def get_weak_binders(self): return [x for x in self.get_best_per_icore() if x.is_weak_binder()] def get_all_binders(self): - return [x for x in self.get_best_per_icore() if x.is_strong_binder() or x.is_weak_binder()] + return [ + x + for x in self.get_best_per_icore() + if x.is_strong_binder() or x.is_weak_binder() + ] def get_best_binder(self): if len(self.get_best_per_icore()) == 0: return None - return sorted(self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False)[0] + return sorted( + self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False + )[0] # @@ -401,7 +427,13 @@ def __init__(self, maf_row, cds_seq, cdna_seq): ) else: - self.identifier_key = str(self.maf_row["Chromosome"]) + encoded_position + "_" + "SY" + Allele2code + self.identifier_key = ( + str(self.maf_row["Chromosome"]) + + encoded_position + + "_" + + "SY" + + Allele2code + ) print(self.identifier_key) ### Check if the variant_classification is among those that can generate a neoantigen @@ -415,7 +447,9 @@ def is_non_syn(self): "Nonstop_Mutation", ] - return self.maf_row["Variant_Classification"] in types and not pd.isnull(self.maf_row["HGVSp_Short"]) + return self.maf_row["Variant_Classification"] in types and not pd.isnull( + self.maf_row["HGVSp_Short"] + ) ### helper function #source: stackoverflow. 
def reverse_complement(self, dna): @@ -522,13 +556,19 @@ def generate_translated_sequences(self, pad_len=10): position, ref_allele, alt_allele, sequence, hgvsc_type = [-1, "", "", "", "ONP"] if re.match(r"^c\.(\d+).*([ATCG]+)>([ATCG]+)$", hgvsc): - position, ref_allele, alt_allele = re.match(r"^c\.(\d+).*(\w+)>(\w+)", hgvsc).groups() + position, ref_allele, alt_allele = re.match( + r"^c\.(\d+).*(\w+)>(\w+)", hgvsc + ).groups() elif re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc): - position, ref_allele, alt_allele = re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc).groups() + position, ref_allele, alt_allele = re.match( + r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc + ).groups() elif re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc): - position, hgvsc_type, sequence = re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc).groups() + position, hgvsc_type, sequence = re.match( + r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc + ).groups() else: sys.exit("Error: not one of the known HGVSc strings: " + hgvsc) @@ -572,7 +612,10 @@ def generate_translated_sequences(self, pad_len=10): mt_rev = mt[::-1] for i in range(0, min(len(wt), len(mt))): len_from_end = i - if len_from_end + len_from_start >= min(len(wt), len(mt)) or wt_rev[i : i + 1] != mt_rev[i : i + 1]: + if ( + len_from_end + len_from_start >= min(len(wt), len(mt)) + or wt_rev[i : i + 1] != mt_rev[i : i + 1] + ): break wt_start = len_from_start @@ -584,8 +627,12 @@ def generate_translated_sequences(self, pad_len=10): self.wt_aa = wt self.mt_aa = mt - self.wt_altered_aa = wt[max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1)] - self.mt_altered_aa = mt[max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1)] + self.wt_altered_aa = wt[ + max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1) + ] + self.mt_altered_aa = mt[ + max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1) + ] # function to iterate over all the the neopeptide predictions made in the entire MAF and identify # which neopeptides are generated by this mutation object @@ -617,7 +664,9 @@ def get_maf_row_to_print(self): row["neo_best_algorithm"] = best_binder.algorithm row["neo_best_version"] = best_binder.version row["neo_best_hla_allele"] = best_binder.hla_allele - row["neo_n_peptides_evaluated"] = len(self.predicted_neopeptides.get_best_per_icore()) + row["neo_n_peptides_evaluated"] = len( + self.predicted_neopeptides.get_best_per_icore() + ) row["neo_n_strong_binders"] = len(strong_binders) row["neo_n_weak_binders"] = len(weak_binders) else: diff --git a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py index c980561..5aa3e09 100755 --- a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py +++ b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py @@ -9,8 +9,8 @@ VERSION = 1.7 - def main(args): + def makeChild(subTree, start): if start: subTree = 0 @@ -37,10 +37,16 @@ def makeChild(subTree, start): pass else: for ssm in treefile["mut_assignments"][str(subTree)]["ssms"]: - ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"]) + ssmli.append( + chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"] + ) newsubtree["clone_mutations"] = ssmli - newsubtree["X"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0] - newsubtree["x"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0] + 
newsubtree["X"] = trees[tree]["populations"][str(subTree)][ + "cellular_prevalence" + ][0] + newsubtree["x"] = trees[tree]["populations"][str(subTree)][ + "cellular_prevalence" + ][0] newsubtree["new_x"] = 0.0 except Exception as e: print("Error in adding new subtree. Error not in base case**") @@ -59,15 +65,21 @@ def makeChild(subTree, start): try: ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"]) except Exception as e: - print("Error in appending to mutation list. Error in base case appending ssm to ssmli") + print( + "Error in appending to mutation list. Error in base case appending ssm to ssmli" + ) print(e) # print(str(subTree)) pass try: newsubtree["clone_mutations"] = ssmli - newsubtree["X"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0] - newsubtree["x"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0] + newsubtree["X"] = trees[tree]["populations"][str(subTree)][ + "cellular_prevalence" + ][0] + newsubtree["x"] = trees[tree]["populations"][str(subTree)][ + "cellular_prevalence" + ][0] newsubtree["new_x"] = 0.0 except Exception as e: print("Error in adding new subtree. Error in base case") @@ -93,7 +105,7 @@ def makeChild(subTree, start): for index, row in mafdf.iterrows(): if ( - # We + #We row["Variant_Type"] == "SNP" or row["Variant_Type"] == "DEL" or row["Variant_Type"] == "INS" @@ -343,13 +355,13 @@ def find_first_difference_index(str1, str2): # This is used as last resort for the matching. We will preferentially find the peptide matching in length as well as POS. Worst case we will default to the WT pos 0 if noposID not in WTdict: WTdict[noposID] = { - "peptides": {row_WT["peptide"]: id}, # This is a dict so we can match the peptide with the ID later - "affinity": row_WT["affinity"], + 'peptides' : {row_WT["peptide"]:id}, #This is a dict so we can match the peptide with the ID later + "affinity": row_WT["affinity"] } else: # print(WTdict[noposID]['peptides']) - WTdict[noposID]["peptides"][row_WT["peptide"]] = id + WTdict[noposID]['peptides'][row_WT["peptide"]]=id def find_most_similar_string(target, strings): max_score = -1 @@ -375,7 +387,7 @@ def find_most_similar_string(target, strings): max_score2 = score most_similar_string2 = s - if target[0] == s[0]: + if target[0]==s[0]: if score > first_AA_same_score: first_AA_same = s first_AA_same_score = score @@ -383,13 +395,13 @@ def find_most_similar_string(target, strings): return most_similar_string, most_similar_string2, first_AA_same, first_AA_same_score, max_score for index_mut, row_mut in neoantigen_mut_in.iterrows(): - IDsplit = row_mut["Identity"].split("_") - if row_mut["affinity"] < 500: + IDsplit = row_mut["Identity"].split('_') + if row_mut["affinity"]< 500: peplen = len(row_mut["peptide"]) matchfound = False - IDsplit = row_mut["Identity"].split("_") - if IDsplit[1][0] == "S" and IDsplit[1][1] != "p": - # If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. Splices can be Sp + IDsplit = row_mut["Identity"].split('_') + if (IDsplit[1][0] == "S" and IDsplit[1][1] != 'p') : + #If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. 
Splices can be Sp continue # first find match in WT WTid = ( @@ -410,7 +422,7 @@ def find_most_similar_string(target, strings): + row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "") ) - if WTid in WTdict and ("M" == IDsplit[1][0] and "Sp" not in row_mut["Identity"]): + if WTid in WTdict and ('M' == IDsplit[1][0] and 'Sp' not in row_mut["Identity"]): # match matchfound = True best_pepmatch = WTdict[WTid]["peptide"] @@ -424,31 +436,28 @@ def find_most_similar_string(target, strings): # print(mutation_dict[row_mut["Identity"]]) else: - ( - best_pepmatch, - best_pepmatch2, - first_AA_same, - first_AA_same_score, - match_score, - ) = find_most_similar_string(row_mut["peptide"], list(WTdict[noposID]["peptides"].keys())) + best_pepmatch,best_pepmatch2 , first_AA_same, first_AA_same_score, match_score = find_most_similar_string(row_mut["peptide"],list(WTdict[noposID]['peptides'].keys())) if best_pepmatch == row_mut["peptide"]: - # it seems this can happen where the row_mut is actually the canonical sequence. + #it seems this can happen where the row_mut is actually the canonical sequence. # In this case we don't want to report the peptide as a neoantigen, its not neo continue - elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or ( - best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1] - ): + elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or (best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1]): # We should preferentially match the first AA if we can. I have found that the pairwise alignment isnt always the best at this. # It will also do this when the last AA of the best match doesnt match but the last A of the second best match does best_pepmatch = best_pepmatch2 - WTid = WTdict[noposID]["peptides"][best_pepmatch] - matchfound = True + WTid = WTdict[noposID]['peptides'][best_pepmatch] + matchfound=True if matchfound == True: - mut_pos = find_first_difference_index(row_mut["peptide"], best_pepmatch) + 1 # WTdict[WTid]["peptide"] + mut_pos = ( + find_first_difference_index( + row_mut["peptide"], best_pepmatch #WTdict[WTid]["peptide"] + ) + + 1 + ) neo_dict = { "id": row_mut["Identity"] @@ -461,7 +470,7 @@ def find_most_similar_string(target, strings): "mutation_id": mutation_dict[row_mut["Identity"]], "HLA_gene_id": row_mut["MHC"], "sequence": row_mut["peptide"], - "WT_sequence": best_pepmatch, # WTdict[WTid]["peptide"], + "WT_sequence": best_pepmatch ,#WTdict[WTid]["peptide"], "mutated_position": mut_pos, "Kd": float(row_mut["affinity"]), "KdWT": float(WTdict[WTid]["affinity"]), @@ -514,7 +523,7 @@ def makeID(maf_row): "Frame_shift_Del": "I-", "In_Frame_Ins": "If", "In_Frame_Del": "Id", - "Splice_Site": "Sp", + "Splice_Site": "Sp" } position = int(str(maf_row["Start_Position"])[0:2]) @@ -564,15 +573,26 @@ def makeID(maf_row): ) else: - identifier_key = str(maf_row["Chromosome"]) + encoded_position + "_" + "SY" + Allele2code + "_M" + identifier_key = ( + str(maf_row["Chromosome"]) + + encoded_position + + "_" + + "SY" + + Allele2code + + "_M" + ) return identifier_key def parse_args(): parser = argparse.ArgumentParser(description="Process input files and parameters") parser.add_argument("--maf_file", required=True, help="Path to the MAF file") - parser.add_argument("--summary_file", required=True, help="Path to the summary file") - parser.add_argument("--mutation_file", required=True, help="Path to the mutation 
file") + parser.add_argument( + "--summary_file", required=True, help="Path to the summary file" + ) + parser.add_argument( + "--mutation_file", required=True, help="Path to the mutation file" + ) parser.add_argument( "--tree_directory", required=True, @@ -581,7 +601,9 @@ def parse_args(): parser.add_argument("--id", required=True, help="ID") parser.add_argument("--patient_id", required=True, help="Patient ID") parser.add_argument("--cohort", required=True, help="Cohort") - parser.add_argument("--HLA_genes", required=True, help="Path to the file containing HLA genes") + parser.add_argument( + "--HLA_genes", required=True, help="Path to the file containing HLA genes" + ) parser.add_argument( "--netMHCpan_MUT_input", required=True, @@ -597,7 +619,9 @@ def parse_args(): "--patient_data_file", help="Path to the optional file containing status, overall survival, and PFS", ) - parser.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION)) + parser.add_argument( + "-v", "--version", action="version", version="%(prog)s {}".format(VERSION) + ) return parser.parse_args() diff --git a/modules/msk/netmhcpan/main.nf b/modules/msk/netmhcpan/main.nf index 4f03154..bc54431 100644 --- a/modules/msk/netmhcpan/main.nf +++ b/modules/msk/netmhcpan/main.nf @@ -25,14 +25,25 @@ process NETMHCPAN { output_meta = meta.clone() output_meta.typeMut = inputType == "MUT" ? true : false output_meta.fromStab = false + def NETMHCPAN_VERSION = "4.1" """ - /usr/local/bin/netMHCpan-4.1/netMHCpan -s 0 -BA 1 -f ${inputFasta} -a ${hla} -l 9,10 -inptype 0 -xls -xlsfile ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output + /usr/local/bin/netMHCpan-${NETMHCPAN_VERSION}/netMHCpan \ + -s 0 \ + -BA 1 \ + -f ${inputFasta} \ + -a ${hla} \ + -l 9,10 \ + -inptype 0 \ + -xls \ + ${args} \ + -xlsfile \ + ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output cat <<-END_VERSIONS > versions.yml "${task.process}": - netmhcpan: v4.1 + netmhcpan: v${NETMHCPAN_VERSION} END_VERSIONS """ @@ -40,6 +51,7 @@ process NETMHCPAN { stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def NETMHCPAN_VERSION = "4.1" output_meta = meta.clone() output_meta.typeMut = inputType == "MUT" ? true : false output_meta.fromStab = false @@ -49,7 +61,7 @@ process NETMHCPAN { cat <<-END_VERSIONS > versions.yml "${task.process}": - netmhcpan: v4.1 + netmhcpan: v${NETMHCPAN_VERSION} END_VERSIONS """ } diff --git a/modules/msk/netmhcstabpan/main.nf b/modules/msk/netmhcstabpan/main.nf index df5b61a..316195f 100644 --- a/modules/msk/netmhcstabpan/main.nf +++ b/modules/msk/netmhcstabpan/main.nf @@ -25,14 +25,23 @@ process NETMHCSTABPAN { output_meta = meta.clone() output_meta.typeMut = inputType == "MUT" ? true : false output_meta.fromStab = true + + def NETMHCPAN_VERSION = "4.1" + def NETMHCSTABPAN_VERSION = "1.0" + """ - /usr/local/bin/netMHCstabpan-1.0/netMHCstabpan -s -1 -f ${inputFasta} -a ${hla} -l 9,10 -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output + /usr/local/bin/netMHCstabpan-${NETMHCSTABPAN_VERSION}/netMHCstabpan \ + -s -1 \ + -f ${inputFasta} \ + -a ${hla} \ + -l 9,10 \ + -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output cat <<-END_VERSIONS > versions.yml "${task.process}": - netmhcpan: v4.1 - netmhcstabpan: v1.0 + netmhcpan: v${NETMHCPAN_VERSION} + netmhcstabpan: v${NETMHCSTABPAN_VERSION} END_VERSIONS """ @@ -43,14 +52,17 @@ process NETMHCSTABPAN { output_meta = meta.clone() output_meta.typeMut = inputType == "MUT" ? 
true : false output_meta.fromStab = true + def NETMHCPAN_VERSION = "4.1" + def NETMHCSTABPAN_VERSION = "1.0" + """ touch ${prefix}.MUT.netmhcstabpan.output cat <<-END_VERSIONS > versions.yml "${task.process}": - netmhcpan: v4.1 - netmhcstabpan: v1.0 + netmhcpan: v${NETMHCPAN_VERSION} + netmhcstabpan: v${NETMHCSTABPAN_VERSION} END_VERSIONS """ } diff --git a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py index b565290..e15092a 100755 --- a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py +++ b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py @@ -22,7 +22,9 @@ class ReadCountsUnavailableError(Exception): class MafVariant(object): - def __init__(self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele): + def __init__( + self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele + ): self.CHROM = chromosome self.POS = position self.FILTER = filter @@ -105,8 +107,12 @@ def _get_tumor_index(self, variant, tumor_sample=None): Might not always be true """ if self._tumor_sample: - tumor_is = [i for i, s in enumerate(variant.samples) if s.sample == tumor_sample] - assert len(tumor_is) == 1, "Did not find tumor name %s in samples" % tumor_sample + tumor_is = [ + i for i, s in enumerate(variant.samples) if s.sample == tumor_sample + ] + assert len(tumor_is) == 1, ( + "Did not find tumor name %s in samples" % tumor_sample + ) return tumor_is[0] else: # Don't make this -1, as some code assumes it will be >= 0. @@ -146,10 +152,14 @@ def _calc_read_counts(self, variant): }, } - ref_reads = tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt] + ref_reads = ( + tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt] + ) # For now, variant reads are defined as only the non-reference nucleotide in # the inferred tumor SNP. We ignore reads of a third or fourth base. 
- variant_reads = tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt] + variant_reads = ( + tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt] + ) total_reads = ref_reads + variant_reads return (ref_reads, total_reads) @@ -235,7 +245,9 @@ def _calc_read_counts(self, variant): total_reads = 0 variant_reads = 0 else: - variant_reads = int(getattr(variant.samples[tumor_i].data, str(alt) + "U")[0]) + variant_reads = int( + getattr(variant.samples[tumor_i].data, str(alt) + "U")[0] + ) ref_reads = total_reads - variant_reads return (ref_reads, total_reads) @@ -299,6 +311,7 @@ def _calc_read_counts(self, variant): class MafParser(MutectSmchetParser): + def _calc_read_counts(self, variant): total_reads = variant.T_REF + variant.T_ALT return (variant.T_REF, total_reads) @@ -323,7 +336,9 @@ def _parse_maf(self, maf_filename): pos = start if variant_type == "DEL": pos = pos - 1 - variant = MafVariant(chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele) + variant = MafVariant( + chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele + ) variant_list.append(variant) return variant_list @@ -433,7 +448,9 @@ def _format_cnvs(self, cnvs, variants): for chrom, chrom_cnvs in cnvs.items(): for cnv in chrom_cnvs: - overlapping_variants = self._find_overlapping_variants(chrom, cnv, variants) + overlapping_variants = self._find_overlapping_variants( + chrom, cnv, variants + ) total_reads = self._calc_total_reads(cnv["start"], cnv["end"]) ref_reads = self._calc_ref_reads(cnv["cell_prev"], total_reads) yield { @@ -477,12 +494,20 @@ def format_and_merge_cnvs(self, cnvs, variants, cellularity): for K in ("chrom", "start", "end", "major_cn", "minor_cn"): physical_cnvs[K] = cnv[K] - assert len(set(physical_cnvs["major_cn"])) == len(set(physical_cnvs["major_cn"])) == 1 + assert ( + len(set(physical_cnvs["major_cn"])) + == len(set(physical_cnvs["major_cn"])) + == 1 + ) physical_cnvs["major_cn"] = physical_cnvs["major_cn"][0] physical_cnvs["minor_cn"] = physical_cnvs["minor_cn"][0] - physical_cnvs["cell_prev"] = "|".join([str(C) for C in cnv["cellular_prevalence"]]) - cnv["physical_cnvs"] = ",".join(["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()]) + physical_cnvs["cell_prev"] = "|".join( + [str(C) for C in cnv["cellular_prevalence"]] + ) + cnv["physical_cnvs"] = ",".join( + ["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()] + ) merged, formatted = formatted[:1], formatted[1:] merged[0]["cnv_id"] = "c0" @@ -496,13 +521,20 @@ def format_and_merge_cnvs(self, cnvs, variants, cellularity): # Only merge CNVs if they're clonal. If they're subclonal, leave them # free to move around the tree. - if np.array_equal(current["cellular_prevalence"], last["cellular_prevalence"]) and np.array_equal( - last["cellular_prevalence"], cellularity - ): + if np.array_equal( + current["cellular_prevalence"], last["cellular_prevalence"] + ) and np.array_equal(last["cellular_prevalence"], cellularity): # Merge the CNVs. 
- log("Merging %s_%s and %s_%s" % (current["chrom"], current["start"], last["chrom"], last["start"])) - last["total_reads"] = self._cap_cnv_D(current["total_reads"] + last["total_reads"]) - last["ref_reads"] = self._calc_ref_reads(last["cellular_prevalence"], last["total_reads"]) + log( + "Merging %s_%s and %s_%s" + % (current["chrom"], current["start"], last["chrom"], last["start"]) + ) + last["total_reads"] = self._cap_cnv_D( + current["total_reads"] + last["total_reads"] + ) + last["ref_reads"] = self._calc_ref_reads( + last["cellular_prevalence"], last["total_reads"] + ) last["physical_cnvs"] += ";" + current["physical_cnvs"] self._merge_variants(last, current) else: @@ -534,7 +566,9 @@ def _calc_ref_freq(self, ref_genotype, error_rate): raise Exception("Nonsensical frequency: %s" % freq) return freq - def format_variants(self, variants, ref_read_counts, total_read_counts, error_rate, sex): + def format_variants( + self, variants, ref_read_counts, total_read_counts, error_rate, sex + ): for variant_idx, variant in enumerate(variants): ssm_id = "s%s" % self._counter if hasattr(variant, "ID") and variant.ID is not None: @@ -649,7 +683,10 @@ def _create_intervals(self, cnv_set): # some intervals revealed that the code crashes on this input. We # should provide a more informative error in such cases, which the # following assertion does. - assert cnv["start"] < cnv["end"], "In CNV %s, start position occurs at or after the end position" % cnv + assert cnv["start"] < cnv["end"], ( + "In CNV %s, start position occurs at or after the end position" + % cnv + ) start_pos = [ ( @@ -670,7 +707,9 @@ def _create_intervals(self, cnv_set): # True > False, so this sorting will place start positions after end # positions if both have same coordinate. - positions = sorted(start_pos + end_pos, key=lambda e: (e[0], e[1] == "start")) + positions = sorted( + start_pos + end_pos, key=lambda e: (e[0], e[1] == "start") + ) assert len(positions) >= 2, "Fewer than two positions in %s" % positions # prev_pos is updated each time we move to a new coordinate on the @@ -701,7 +740,9 @@ def _create_intervals(self, cnv_set): assert locus > prev_pos interval = (prev_pos, locus) if interval[1] - interval[0] > min_size_for_inclusion: - intervals[chrom].append((interval[0], interval[1], sorted(list(set(open_samples))))) + intervals[chrom].append( + (interval[0], interval[1], sorted(list(set(open_samples)))) + ) else: # All points should be start points. assert set([i[1] for i in points_at_locus]) == set(["start"]) @@ -831,7 +872,9 @@ def _get_abnormal_state_for_all_samples(self, chrom, cnv): abnormal_state = None filtered = [] - for sampidx, cell_prev, major, minor in zip(cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"]): + for sampidx, cell_prev, major, minor in zip( + cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"] + ): # Region may be (clonal or subclonal) normal in a sample, so ignore such records. 
if self._is_region_normal_cn(chrom, major, minor): continue @@ -908,12 +951,15 @@ def load_single_abnormal_state_cnvs(self): if not is_good_chrom(chrom): continue for cnv in chrom_cnvs: - states_for_all_samples = self._get_abnormal_state_for_all_samples(chrom, cnv) + states_for_all_samples = self._get_abnormal_state_for_all_samples( + chrom, cnv + ) if states_for_all_samples is None: continue combined_states = { - K: np.array([S[K] for S in states_for_all_samples]) for K in states_for_all_samples[0].keys() + K: np.array([S[K] for S in states_for_all_samples]) + for K in states_for_all_samples[0].keys() } cnv.update(combined_states) abnormal_cnvs[chrom].append(cnv) @@ -932,7 +978,9 @@ def load_normal_cnvs(self): if not is_good_chrom(chrom): continue for cnv in chrom_cnvs: - if not self._is_multisample_region_normal_cn(chrom, cnv["major_cn"], cnv["minor_cn"]): + if not self._is_multisample_region_normal_cn( + chrom, cnv["major_cn"], cnv["minor_cn"] + ): continue if not set(cnv["sampidx"]) == self.sampidxs: continue @@ -1081,7 +1129,9 @@ def retain_only_variants_in_normal_cn_regions(self): raise Exception("CN regions not yet provided") normal_cn = self._multisamp_cnv.load_normal_cnvs() - filtered = self._filter_variants_outside_regions(normal_cn, "all_variants", "only_normal_cn") + filtered = self._filter_variants_outside_regions( + normal_cn, "all_variants", "only_normal_cn" + ) def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self): # Battenberg: @@ -1104,9 +1154,13 @@ def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self): # If variant isn't listed in *any* region: exclude (as we suspect CNV # caller didn't know what to do with the region). - self._filter_variants_outside_regions(self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions") + self._filter_variants_outside_regions( + self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions" + ) - def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, sex): + def format_variants( + self, sample_size, error_rate, priority_ssms, only_priority, sex + ): if sample_size is None: sample_size = len(self._variant_idxs) random.shuffle(self._variant_idxs) @@ -1135,7 +1189,11 @@ def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, else: nonsubsampled.append(variant_idx) - assert len(used_variant_idxs) == len(self._variant_idxs) == len(subsampled) + len(nonsubsampled) + assert ( + len(used_variant_idxs) + == len(self._variant_idxs) + == len(subsampled) + len(nonsubsampled) + ) subsampled.sort(key=lambda idx: variant_key(self._variants[idx])) subsampled_variants = get_elements_at_indices(self._variants, subsampled) @@ -1174,7 +1232,9 @@ def write_variants(self, variants, outfn): print("\t".join(("id", "gene", "a", "d", "mu_r", "mu_v")), file=outf) for variant in variants: variant["ref_reads"] = ",".join([str(v) for v in variant["ref_reads"]]) - variant["total_reads"] = ",".join([str(v) for v in variant["total_reads"]]) + variant["total_reads"] = ",".join( + [str(v) for v in variant["total_reads"]] + ) vals = ( "ssm_id", "variant_name", @@ -1190,7 +1250,10 @@ def _estimate_read_depth(self): read_sum = 0 if len(self._variants) == 0: default_read_depth = 50 - log("No variants available, so fixing read depth at %s." % default_read_depth) + log( + "No variants available, so fixing read depth at %s." 
+ % default_read_depth + ) return default_read_depth else: return np.nanmedian(self._total_read_counts, axis=0) @@ -1198,7 +1261,9 @@ def _estimate_read_depth(self): def write_cnvs(self, variants, outfn): with open(outfn, "w") as outf: print("\t".join(("cnv", "a", "d", "ssms", "physical_cnvs")), file=outf) - formatter = CnvFormatter(self._estimated_read_depth, self._sampidxs, self._hetsnp_rate) + formatter = CnvFormatter( + self._estimated_read_depth, self._sampidxs, self._hetsnp_rate + ) for cnv in formatter.format_and_merge_cnvs( self._multisamp_cnv.load_single_abnormal_state_cnvs(), variants, @@ -1298,7 +1363,9 @@ def impute_missing_total_reads(total_reads, missing_variant_confidence): assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0 # Convert 1D arrays to vectors to permit matrix multiplication. - imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1))) + imputed_counts = np.dot( + variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1)) + ) nan_coords = np.where(np.isnan(total_reads)) total_reads[nan_coords] = imputed_counts[nan_coords] assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0 @@ -1331,7 +1398,9 @@ def is_good_chrom(chrom): return False -def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence): +def parse_variants( + samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence +): parsed_variants = [] all_variant_ids = [] num_samples = len(samples) @@ -1376,7 +1445,11 @@ def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_ ) ) else: - variant_ids.append(VariantId(str(single_variant[0].CHROM), int(single_variant[0].POS), None)) + variant_ids.append( + VariantId( + str(single_variant[0].CHROM), int(single_variant[0].POS), None + ) + ) all_variant_ids += variant_ids all_variant_ids = list(set(all_variant_ids)) # Eliminate duplicates. @@ -1398,7 +1471,9 @@ def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_ ref_read_counts[variant_idx, sample_idx] = ref_reads total_read_counts[variant_idx, sample_idx] = total_reads - total_read_counts = impute_missing_total_reads(total_read_counts, missing_variant_confidence) + total_read_counts = impute_missing_total_reads( + total_read_counts, missing_variant_confidence + ) ref_read_counts = impute_missing_ref_reads(ref_read_counts, total_read_counts) return (all_variant_ids, ref_read_counts, total_read_counts) @@ -1411,7 +1486,9 @@ def infer_sex(variant_ids): return "female" -def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples): +def extract_sample_data( + vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples +): vcf_files = {} vcf_types = {} cnv_files = {} @@ -1426,7 +1503,9 @@ def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_ should_use_cnvs = cnv_files_and_samples is not None if should_use_cnvs: - assert len(cnv_files_and_samples) == len(vcf_files_and_samples), "Must specify same number of VCF and CNV files" + assert len(cnv_files_and_samples) == len( + vcf_files_and_samples + ), "Must specify same number of VCF and CNV files" srcs_and_dsts.append((cnv_files_and_samples, cnv_files)) for src, dst in srcs_and_dsts: @@ -1436,20 +1515,28 @@ def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_ dst[sample] = val # Sample order will dictate eventual output order. 
- common_samps = reduce(lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts]) + common_samps = reduce( + lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts] + ) ordered_samps = [S.split("=", 1)[0] for S in vcf_files_and_samples] assert len(ordered_samps) == len(common_samps) # Ensure no duplicates. - assert set(vcf_files.keys()) == common_samps, "VCF file samples (%s) differ from common samples (%s)" % ( + assert ( + set(vcf_files.keys()) == common_samps + ), "VCF file samples (%s) differ from common samples (%s)" % ( vcf_files.keys(), common_samps, ) - assert set(vcf_types.keys()) == common_samps, "VCF type samples (%s) differ from common samples (%s)" % ( + assert ( + set(vcf_types.keys()) == common_samps + ), "VCF type samples (%s) differ from common samples (%s)" % ( vcf_types.keys(), common_samps, ) if should_use_cnvs: - assert set(cnv_files.keys()) == common_samps, "CNV file samples (%s) differ from CNV file samples (%s)" % ( + assert ( + set(cnv_files.keys()) == common_samps + ), "CNV file samples (%s) differ from CNV file samples (%s)" % ( cnv_files.keys(), common_samps, ) @@ -1600,7 +1687,9 @@ def main(): log.verbose = args.verbose params = {} - samples, vcf_files, vcf_types, cnv_files = extract_sample_data(args.vcf_files, args.vcf_types, args.cnv_files) + samples, vcf_files, vcf_types, cnv_files = extract_sample_data( + args.vcf_files, args.vcf_types, args.cnv_files + ) params["samples"], params["vcf_files"], params["vcf_types"], params["cnv_files"] = ( samples, vcf_files, @@ -1634,7 +1723,9 @@ def main(): grouper.add_cnvs(cn_regions, sex) if not grouper.has_cnvs(): - assert args.regions == "all", "If you do not provide CNA data, you must specify --regions=all" + assert ( + args.regions == "all" + ), "If you do not provide CNA data, you must specify --regions=all" if args.regions == "normal_cn": grouper.retain_only_variants_in_normal_cn_regions() @@ -1660,8 +1751,13 @@ def main(): if grouper.has_cnvs() and args.regions != "normal_cn": # Write CNVs. grouper.write_cnvs(subsampled_vars, args.output_cnvs) - if args.output_nonsubsampled_variants and args.output_nonsubsampled_variants_cnvs: - grouper.write_cnvs(nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs) + if ( + args.output_nonsubsampled_variants + and args.output_nonsubsampled_variants_cnvs + ): + grouper.write_cnvs( + nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs + ) else: # Write empty CNV file. 
with open(args.output_cnvs, "w"): diff --git a/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py b/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py index e8ac2c3..e1fdbdc 100755 --- a/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py +++ b/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py @@ -131,7 +131,8 @@ def parse(self): and str.isdigit(record["lcn.em"]) and str.isdigit(record["cf.em"].replace(".", "", 1)) ): - cnv["cellular_prevalence"] = float(record["cf.em"]) * self._cellularity + cnv["cellular_prevalence"] = ( + float(record["cf.em"])) cnv["minor_cn"] = int(record["lcn.em"]) cnv["major_cn"] = int(record["tcn.em"]) - cnv["minor_cn"] chrom = record["chrom"] @@ -181,7 +182,9 @@ def parse(self): cnv1["end"] = end cnv1["major_cn"] = int(fields[8 + self._field_offset]) cnv1["minor_cn"] = int(fields[9 + self._field_offset]) - cnv1["cellular_prevalence"] = float(fields[10 + self._field_offset]) * self._cellularity + cnv1["cellular_prevalence"] = ( + float(fields[10 + self._field_offset]) * self._cellularity + ) cnv2 = None # Stefan's comment on p values: The p-values correspond "to whether a @@ -198,11 +201,15 @@ def parse(self): cnv2["end"] = end cnv2["major_cn"] = int(fields[11 + self._field_offset]) cnv2["minor_cn"] = int(fields[12 + self._field_offset]) - cnv2["cellular_prevalence"] = float(fields[13 + self._field_offset]) * self._cellularity + cnv2["cellular_prevalence"] = ( + float(fields[13 + self._field_offset]) * self._cellularity + ) else: cnv1["cellular_prevalence"] = self._cellularity - if cnv1["start"] >= cnv1["end"] or (cnv2 is not None and cnv2["start"] >= cnv2["end"]): + if cnv1["start"] >= cnv1["end"] or ( + cnv2 is not None and cnv2["start"] >= cnv2["end"] + ): continue cn_regions[chrom].append(cnv1) @@ -258,7 +265,8 @@ def main(): if args.cellularity > 1.0: print( - "Cellularity for %s is %s. Setting to 1.0." % (args.cnv_file, args.cellularity), + "Cellularity for %s is %s. Setting to 1.0." 
+ % (args.cnv_file, args.cellularity), file=sys.stderr, ) cellularity = 1.0 diff --git a/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap index 58351ec..7d9d8ad 100644 --- a/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap +++ b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap @@ -8,7 +8,7 @@ "id": "test", "single_end": false }, - "cnvs.txt:md5,5ba8c1c06170c6d7ee9c18026b710154" + "cnvs.txt:md5,2f7b4d8ceff8a9505c4b69e2b1e9ccc7" ] ], "1": [ @@ -20,7 +20,7 @@ "id": "test", "single_end": false }, - "cnvs.txt:md5,5ba8c1c06170c6d7ee9c18026b710154" + "cnvs.txt:md5,2f7b4d8ceff8a9505c4b69e2b1e9ccc7" ] ], "versions": [ @@ -28,7 +28,7 @@ ] } ], - "timestamp": "2024-06-11T18:02:28.983981" + "timestamp": "2024-07-08T12:15:00.257251" }, "phylowgs_parsecnvs - txt - stub": { "content": [ @@ -59,6 +59,6 @@ ] } ], - "timestamp": "2024-06-11T18:02:36.26591" + "timestamp": "2024-07-08T12:15:19.111149" } } \ No newline at end of file diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml deleted file mode 100644 index 1787b38..0000000 --- a/modules/nf-core/fastqc/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: fastqc -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf deleted file mode 100644 index d79f1c8..0000000 --- a/modules/nf-core/fastqc/main.nf +++ /dev/null @@ -1,61 +0,0 @@ -process FASTQC { - tag "$meta.id" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // Make list of old name and new name pairs to use for renaming in the bash while loop - def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') - def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') - - def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') - // FastQC memory value allowed range (100 - 10000) - def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 
100 : memory_in_mb) - - """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done - - fastqc \\ - $args \\ - --threads $task.cpus \\ - --memory $fastqc_memory \\ - $renamed_files - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml deleted file mode 100644 index ee5507e..0000000 --- a/modules/nf-core/fastqc/meta.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. - It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ - licence: ["GPL-2.0-only"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - html: - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" -maintainers: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test deleted file mode 100644 index 70edae4..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ /dev/null @@ -1,212 +0,0 @@ -nextflow_process { - - name "Test Process FASTQC" - script "../main.nf" - process "FASTQC" - - tag "modules" - tag "modules_nfcore" - tag "fastqc" - - test("sarscov2 single-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_single") } - ) - } - } - - test("sarscov2 paired-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_paired") } - ) - } - } - - test("sarscov2 interleaved [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_interleaved") } - ) - } - } - - test("sarscov2 paired-end [bam]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_bam") } - ) - } - } - - test("sarscov2 multiple [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert 
process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, - { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, - { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_multiple") } - ) - } - } - - test("sarscov2 custom_prefix") { - - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_custom_prefix") } - ) - } - } - - test("sarscov2 single-end [fastq] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out.html.collect { file(it[1]).getName() } + - process.out.zip.collect { file(it[1]).getName() } + - process.out.versions ).match("fastqc_stub") } - ) - } - } - -} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap deleted file mode 100644 index 86f7c31..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ /dev/null @@ -1,88 +0,0 @@ -{ - "fastqc_versions_interleaved": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:07.293713" - }, - "fastqc_stub": { - "content": [ - [ - "test.html", - "test.zip", - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:31:01.425198" - }, - "fastqc_versions_multiple": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:55.797907" - }, - "fastqc_versions_bam": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:26.795862" - }, - "fastqc_versions_single": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:39:27.043675" - }, - "fastqc_versions_paired": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" 
- ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:39:47.584191" - }, - "fastqc_versions_custom_prefix": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:41:14.576531" - } -} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml deleted file mode 100644 index 7834294..0000000 --- a/modules/nf-core/fastqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -fastqc: - - modules/nf-core/fastqc/** diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index ecb7dd7..2121492 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.22.3 + - bioconda::multiqc=1.23 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 2581a49..459dfea 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,14 +3,16 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.22.3--pyhdfd78af_0' : - 'biocontainers/multiqc:1.22.3--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.23--pyhdfd78af_0' : + 'biocontainers/multiqc:1.23--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) + path(replace_names) + path(sample_names) output: path "*multiqc_report.html", emit: report @@ -26,6 +28,8 @@ process MULTIQC { def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + def replace = replace_names ? "--replace-names ${replace_names}" : '' + def samples = sample_names ? "--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ @@ -33,6 +37,8 @@ process MULTIQC { $config \\ $extra_config \\ $logo \\ + $replace \\ + $samples \\ . cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index 45a9bc3..382c08c 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -29,6 +29,19 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. 
+ pattern: "*.{tsv}" output: - report: type: file diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index f1c4242..6aa27f4 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -17,6 +17,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -41,6 +43,8 @@ nextflow_process { input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -66,6 +70,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index 0a4760e..45e95e5 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-25T12:31:21.878452033" + "timestamp": "2024-07-10T12:41:34.562023" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-25T12:32:02.322196503" + "timestamp": "2024-07-10T11:27:11.933869532" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-25T12:31:50.064227638" + "timestamp": "2024-07-10T11:26:56.709849369" } } \ No newline at end of file diff --git a/subworkflows/msk/netmhcstabandpan/main.nf b/subworkflows/msk/netmhcstabandpan/main.nf index d0425ef..ea8247c 100644 --- a/subworkflows/msk/netmhcstabandpan/main.nf +++ b/subworkflows/msk/netmhcstabandpan/main.nf @@ -91,13 +91,5 @@ def createNETMHCInput(wt_fasta, mut_fasta, hla) { new Tuple(it[1][0], it[1][1],it[2][1],"WT") } merged = merged_mut.mix(merged_wt) - - -// merged = merged_mut -// .join(merged_wt) -// .map{ -// new Tuple(it[1][0],it[1][1],it[1][2],it[1][3]) -// } - return merged } diff --git a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap index 0d3ce18..dbde65e 100644 --- a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap +++ b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap @@ -6,26 +6,15 @@ "test.MUT_sequences.fa:md5,7fdb7d3f0fe5a6f439ed294b612c2d70", "test.WT_sequences.fa:md5,7595ed6cf0c98500b00c9ad027125b38" ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-11T16:27:38.189273" + "timestamp": "2024-07-30T13:48:55.729458" }, "netmhcstabandpan - tsv,xls,fa - stub": { "content": [ - [ - "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ], + "test.WT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", "test.MUT.xls", "test.MUT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e", "test.WT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ], - "meta": { - "nf-test": "0.8.4", - 
"nextflow": "24.04.2" - }, - "timestamp": "2024-06-11T16:27:49.026711" + "timestamp": "2024-07-30T13:49:11.413783" } } \ No newline at end of file