From 5df2593318a198e45f24376129fe5b49fbf556c8 Mon Sep 17 00:00:00 2001
From: Nikhil Kumar
Date: Tue, 30 Jul 2024 14:39:13 -0400
Subject: [PATCH 1/2] Update to use modules from develop

---
 modules.json                                  |  38 ++--
 .../neoantigenediting/aligntoiedb/meta.yml    |   2 +-
 .../usr/bin/align_neoantigens_to_IEDB.py      |  20 +-
 .../resources/usr/bin/EpitopeDistance.py      |  15 +-
 .../resources/usr/bin/compute_fitness.py      |   8 +-
 .../neoantigenutils/formatnetmhcpan/main.nf   |   6 +-
 .../usr/bin/format_netmhcpan_output.py        |  12 +-
 .../formatnetmhcpan/tests/main.nf.test.snap   |  40 +---
 .../resources/usr/bin/generateMutFasta.py     |  87 ++++++--
 .../resources/usr/bin/generate_input.py       | 102 ++++++----
 modules/msk/netmhcpan/main.nf                 |  18 +-
 modules/msk/netmhcstabpan/main.nf             |  22 +-
 .../usr/bin/create_phylowgs_inputs.py         | 188 +++++++++++++-----
 subworkflows/msk/netmhcstabandpan/main.nf     |   8 -
 .../netmhcstabandpan/tests/main.nf.test.snap  |  17 +-
 15 files changed, 379 insertions(+), 204 deletions(-)

diff --git a/modules.json b/modules.json
index a332e31..bf1a62f 100644
--- a/modules.json
+++ b/modules.json
@@ -6,27 +6,27 @@
         "modules": {
             "msk": {
                 "neoantigenediting/aligntoiedb": {
-                    "branch": "neoantigen",
-                    "git_sha": "14ae2b3db25701828835e7145cee14dfdaddf180",
+                    "branch": "develop",
+                    "git_sha": "cac9c047e374ee259fb612ba5816e7e6aae6b86f",
                     "installed_by": ["neoantigen_editing"]
                 },
                 "neoantigenediting/computefitness": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "1f65c2ecdc5010549055ff7f4e6b8bccee48d4ae",
                     "installed_by": ["neoantigen_editing"]
                 },
                 "neoantigenutils/formatnetmhcpan": {
-                    "branch": "neoantigen",
-                    "git_sha": "fceccd3f96fec678849bb9bc0c04e53d9965f973",
+                    "branch": "develop",
+                    "git_sha": "c5d1252252e15555abcc82ea537cebeb281a1856",
                     "installed_by": ["netmhcstabandpan"]
                 },
                 "neoantigenutils/generatehlastring": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970",
                     "installed_by": ["netmhcstabandpan"]
                 },
                 "neoantigenutils/generatemutfasta": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "bb7975c796ab9a2d7a45ef733a6a226a0f5ad74a",
                     "installed_by": ["netmhcstabandpan"]
                 },
@@ -36,32 +36,32 @@
                     "installed_by": ["modules"]
                 },
                 "netmhcpan": {
-                    "branch": "neoantigen",
-                    "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970",
+                    "branch": "develop",
+                    "git_sha": "503abeb67260f060d8228221b07d743aa4180345",
                     "installed_by": ["modules", "netmhcstabandpan"]
                 },
                 "netmhcstabpan": {
-                    "branch": "neoantigen",
-                    "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970",
+                    "branch": "develop",
+                    "git_sha": "c1a473f8bc08f778269a36ab62d5adf24357225f",
                     "installed_by": ["modules", "netmhcstabandpan"]
                 },
                 "phylowgs/createinput": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "b031249dcf4279606c25e626da2a628756e75e8a",
                     "installed_by": ["phylowgs"]
                 },
                 "phylowgs/multievolve": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "535662d391a3533dea3b11c462c14799227e08b2",
                     "installed_by": ["phylowgs"]
                 },
                 "phylowgs/parsecnvs": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "8471691d7c29bc2f5f4fb92279c94fb2640b6c38",
                     "installed_by": ["phylowgs"]
                 },
                 "phylowgs/writeresults": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "6d27f08bf649e8680ace321d3127dcdf0e210973",
                     "installed_by": ["phylowgs"]
                 }
@@ -70,17 +70,17 @@
         "subworkflows": {
             "msk": {
                 "neoantigen_editing": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "56a628201401866096d6307b9e8c690c5eb46ac2",
                     "installed_by": ["subworkflows"]
                 },
                 "netmhcstabandpan": {
-                    "branch": "neoantigen",
-                    "git_sha": "1b7ac020798572be26402a72dd9c1a22ce849a63",
+                    "branch": "develop",
+                    "git_sha": "d60211568e3709e9284bc06eef938e361d474d08",
                     "installed_by": ["subworkflows"]
                 },
                 "phylowgs": {
-                    "branch": "neoantigen",
+                    "branch": "develop",
                     "git_sha": "a5d61394af346f21ee2eb7ecfd97ab25bdbd1d0e",
                     "installed_by": ["subworkflows"]
                 }
diff --git a/modules/msk/neoantigenediting/aligntoiedb/meta.yml b/modules/msk/neoantigenediting/aligntoiedb/meta.yml
index 7a097b3..77d6121 100644
--- a/modules/msk/neoantigenediting/aligntoiedb/meta.yml
+++ b/modules/msk/neoantigenediting/aligntoiedb/meta.yml
@@ -1,7 +1,7 @@
 ---
 # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
 name: "neoantigenediting_aligntoiedb"
-description: Align neoantigens to the IEDB resource
+description: Align neoantigens to the IEDB file
 keywords:
   - neoantigenediting
   - neoantigens
diff --git a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py
index 20886ca..9ff645a 100755
--- a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py
+++ b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py
@@ -47,14 +47,20 @@ def load_blosum62_mat():
     * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
     """
     amino_acids = "ACDEFGHIKLMNPQRSTVWY"
-    blosum62_mat_str_list = [l.split() for l in raw_blosum62_mat_str.strip().split("\n")]
+    blosum62_mat_str_list = [
+        l.split() for l in raw_blosum62_mat_str.strip().split("\n")
+    ]
     blosum_aa_order = [blosum62_mat_str_list[0].index(aa) for aa in amino_acids]
     blosum62_mat = np.zeros((len(amino_acids), len(amino_acids)))
     for i, bl_ind in enumerate(blosum_aa_order):
-        blosum62_mat[i] = np.array([int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]])[blosum_aa_order]
+        blosum62_mat[i] = np.array(
+            [int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]]
+        )[blosum_aa_order]
     blosum62 = {
-        (aaA, aaB): blosum62_mat[i, j] for i, aaA in enumerate(amino_acids) for j, aaB in enumerate(amino_acids)
+        (aaA, aaB): blosum62_mat[i, j]
+        for i, aaA in enumerate(amino_acids)
+        for j, aaB in enumerate(amino_acids)
     }
     return blosum62
 
@@ -226,7 +232,9 @@ def load_epitopes(iedbfasta):
         pjson = json.load(f)
     patient = pjson["patient"]
     neoantigens = pjson["neoantigens"]
-    peptides = set([("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens])
+    peptides = set(
+        [("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens]
+    )
     pepseq2pepid = defaultdict(set)
     for pep_id, pep_seq in peptides:
         pepseq2pepid[pep_seq].add(pep_id)
@@ -251,5 +259,7 @@ def load_epitopes(iedbfasta):
             "Alignment_score",
         ]
     else:
-        aln_data = pd.DataFrame(columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"])
+        aln_data = pd.DataFrame(
+            columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"]
+        )
     aln_data.to_csv("iedb_alignments_" + patient + ".txt", sep="\t", index=False)
diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
index 03d78c4..ea09073 100755
--- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
+++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
@@ -10,8 +10,7 @@
 import json
 import os
 
-
-# %
+#%
 class EpitopeDistance(object):
     """Base class for epitope crossreactivity.
 
@@ -39,7 +38,9 @@ class EpitopeDistance(object):
 
     def __init__(
         self,
-        model_file=os.path.join(os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json"),
+        model_file=os.path.join(
+            os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json"
+        ),
         amino_acids="ACDEFGHIKLMNPQRSTVWY",
     ):
         """Initialize class and compute M_ab."""
@@ -75,5 +76,11 @@ def epitope_dist(self, epiA, epiB):
         """
 
         return sum(
-            [self.d_i[i] * self.M_ab[self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]]] for i in range(9)]
+            [
+                self.d_i[i]
+                * self.M_ab[
+                    self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]]
+                ]
+                for i in range(9)
+            ]
         )
diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
index bc8cd54..da19f4d 100755
--- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
+++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
@@ -60,7 +60,9 @@ def fill_up_clone_neoantigens(tree, mut2neo):
     while len(nodes) > 0:
         node = nodes[0]
         nodes = nodes[1:]
-        node["neoantigens"] = [neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid]]
+        node["neoantigens"] = [
+            neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid]
+        ]
         node["neoantigen_load"] = len(node["neoantigens"])
         node["NA_Mut"] = sum([len(mut2neo[mid]) > 0 for mid in node["all_mutations"]])
         if "children" in node:
@@ -157,7 +159,9 @@ def compute_effective_sample_size(sample_json):
     for clone_muts, X in zip(clone_muts_list, freqs):
         for mid in clone_muts:
             mut_freqs[mid].append(X)
-    avev = np.mean([np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs])
+    avev = np.mean(
+        [np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs]
+    )
     n = 1 / avev
     return n
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf
index 793432e..d6d67d1 100644
--- a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf
@@ -36,10 +36,10 @@ process NEOANTIGENUTILS_FORMATNETMHCPAN {
     stub:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def netmhcOutputType = meta.typeMut ? "MUT": "WT"
+    def netmhcOutputFrom = meta.fromStab ? "STAB": "PAN"
     """
-
-    touch ${prefix}.MUT.tsv
-    touch ${prefix}.WT.tsv
+    touch ${prefix}.${netmhcOutputType}.${netmhcOutputFrom}.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
        formatNetmhcpanOutput: \$(echo \$(format_netmhcpan_output.py -v))
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
index 3b81dcb..a11be09 100755
--- a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
@@ -71,7 +71,9 @@ def netMHCpan_out_reformat(netMHCpanoutput, mut, stab, prefix):
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Process input files and parameters")
-    parser.add_argument("--netMHCpan_output", required=True, help="Path to netMHC output")
+    parser.add_argument(
+        "--netMHCpan_output", required=True, help="Path to netMHC output"
+    )
     parser.add_argument("--type_MUT", action="store_true", help="Output is a MUT type")
     parser.add_argument(
         "--from_STAB",
@@ -79,13 +81,17 @@ def parse_args():
         help="Output is from netmhcstab",
     )
     parser.add_argument("--id", required=True, help="Prefix to label the output")
-    parser.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION))
+    parser.add_argument(
+        "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
+    )
 
     return parser.parse_args()
 
 
 def main(args):
-    netMHCpan_out_reformat(args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id)
+    netMHCpan_out_reformat(
+        args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id
+    )
 
 
 if __name__ == "__main__":
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap
index 28d29b9..685825a 100644
--- a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap
@@ -30,11 +30,7 @@
             ]
         }
     ],
-    "meta": {
-        "nf-test": "0.8.4",
-        "nextflow": "24.04.2"
-    },
-    "timestamp": "2024-06-13T12:57:40.919561"
+    "timestamp": "2024-07-30T13:46:27.878268"
    },
    "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv - stub": {
        "content": [
@@ -46,10 +42,7 @@
                        "typeMut": true,
                        "fromStab": false
                    },
-                    [
-                        "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
-                        "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
-                    ]
+                    "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
                ]
            ],
            "1": [
@@ -62,10 +55,7 @@
                        "typeMut": true,
                        "fromStab": false
                    },
-                    [
-                        "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
-                        "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
-                    ]
+                    "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
                ]
            ],
            "versions": [
@@ -73,11 +63,7 @@
            ]
        }
    ],
-    "meta": {
-        "nf-test": "0.8.4",
-        "nextflow": "24.04.2"
-    },
-    "timestamp": "2024-06-13T12:58:01.264912"
+    "timestamp": "2024-07-30T13:47:05.72509"
    },
    "neoantigenutils_formatnetmhcpan - output(WT,netmhcpan) - tsv": {
        "content": [
@@ -110,11 +96,7 @@
            ]
        }
    ],
-    "meta": {
-        "nf-test": "0.8.4",
-        "nextflow": "24.04.2"
-    },
-    "timestamp": "2024-06-13T12:57:46.381516"
+    "timestamp": "2024-07-30T13:46:37.183992"
    },
    "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpanstab) - tsv": {
        "content": [
@@ -147,11 +129,7 @@
            ]
        }
    ],
-    "meta": {
-        "nf-test": "0.8.4",
-        "nextflow": "24.04.2"
-    },
-    "timestamp": "2024-06-13T12:57:51.691253"
+    "timestamp": "2024-07-30T13:46:47.110076"
    },
"neoantigenutils_formatnetmhcpan - output(WT,netmhcpanstab) - tsv": { "content": [ @@ -184,10 +162,6 @@ ] } ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-13T12:57:56.882818" + "timestamp": "2024-07-30T13:46:56.841519" } } \ No newline at end of file diff --git a/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py b/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py index e09ab78..84d23fc 100755 --- a/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py +++ b/modules/msk/neoantigenutils/generatemutfasta/resources/usr/bin/generateMutFasta.py @@ -40,10 +40,18 @@ def main(): help="sample_id used to limit neoantigen prediction to identify mutations " "associated with the patient in the MAF (column 16). ", ) - required_arguments.add_argument("--output_dir", required=True, help="output directory") - required_arguments.add_argument("--maf_file", required=True, help="expects a CMO maf file (post vcf2maf.pl)") - required_arguments.add_argument("--CDS_file", required=True, help="expects a fa.gz file") - required_arguments.add_argument("--CDNA_file", required=True, help="expects a fa.gz file") + required_arguments.add_argument( + "--output_dir", required=True, help="output directory" + ) + required_arguments.add_argument( + "--maf_file", required=True, help="expects a CMO maf file (post vcf2maf.pl)" + ) + required_arguments.add_argument( + "--CDS_file", required=True, help="expects a fa.gz file" + ) + required_arguments.add_argument( + "--CDNA_file", required=True, help="expects a fa.gz file" + ) optional_arguments = parser.add_argument_group("Optional arguments") optional_arguments.add_argument( @@ -51,7 +59,9 @@ def main(): required=False, help="comma-separated numbers indicating the lengths of peptides to generate. 
Default: 9,10", ) - optional_arguments.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION)) + optional_arguments.add_argument( + "-v", "--version", action="version", version="%(prog)s {}".format(VERSION) + ) args = parser.parse_args() @@ -74,7 +84,9 @@ def main(): # logger = logging.getLogger("neoantigen") logger.setLevel(logging.DEBUG) - console_formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S") + console_formatter = logging.Formatter( + "%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S" + ) # logfile handler handler_file = logging.FileHandler(output_dir + "/neoantigen_run.log", mode="w") @@ -103,7 +115,9 @@ def main(): logger.info("Finished loading reference CDS/cDNA sequences...") logger.info("Reading MAF file and constructing mutated peptides...") - maf_df = skip_lines_start_with(maf_file, "#", low_memory=False, header=0, sep="\t") + maf_df = skip_lines_start_with( + maf_file, "#", low_memory=False, header=0, sep="\t" + ) n_muts = n_non_syn_muts = n_missing_tx_id = 0 for index, row in maf_df.iterrows(): cds_seq = "" @@ -148,7 +162,13 @@ def main(): logger.info("\tMAF mutations summary") logger.info("\t\t# mutations: " + str(n_muts)) - logger.info("\t\t# non-syn: " + str(n_non_syn_muts) + " (# with missing CDS: " + str(n_missing_tx_id) + ")") + logger.info( + "\t\t# non-syn: " + + str(n_non_syn_muts) + + " (# with missing CDS: " + + str(n_missing_tx_id) + + ")" + ) except Exception: logger.error("Error while generating mutated peptides") @@ -263,12 +283,18 @@ def get_weak_binders(self): return [x for x in self.get_best_per_icore() if x.is_weak_binder()] def get_all_binders(self): - return [x for x in self.get_best_per_icore() if x.is_strong_binder() or x.is_weak_binder()] + return [ + x + for x in self.get_best_per_icore() + if x.is_strong_binder() or x.is_weak_binder() + ] def get_best_binder(self): if len(self.get_best_per_icore()) == 0: return None - return sorted(self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False)[0] + return sorted( + self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False + )[0] # @@ -401,7 +427,13 @@ def __init__(self, maf_row, cds_seq, cdna_seq): ) else: - self.identifier_key = str(self.maf_row["Chromosome"]) + encoded_position + "_" + "SY" + Allele2code + self.identifier_key = ( + str(self.maf_row["Chromosome"]) + + encoded_position + + "_" + + "SY" + + Allele2code + ) print(self.identifier_key) ### Check if the variant_classification is among those that can generate a neoantigen @@ -415,7 +447,9 @@ def is_non_syn(self): "Nonstop_Mutation", ] - return self.maf_row["Variant_Classification"] in types and not pd.isnull(self.maf_row["HGVSp_Short"]) + return self.maf_row["Variant_Classification"] in types and not pd.isnull( + self.maf_row["HGVSp_Short"] + ) ### helper function #source: stackoverflow. 
     def reverse_complement(self, dna):
@@ -522,13 +556,19 @@ def generate_translated_sequences(self, pad_len=10):
         position, ref_allele, alt_allele, sequence, hgvsc_type = [-1, "", "", "", "ONP"]
 
         if re.match(r"^c\.(\d+).*([ATCG]+)>([ATCG]+)$", hgvsc):
-            position, ref_allele, alt_allele = re.match(r"^c\.(\d+).*(\w+)>(\w+)", hgvsc).groups()
+            position, ref_allele, alt_allele = re.match(
+                r"^c\.(\d+).*(\w+)>(\w+)", hgvsc
+            ).groups()
 
         elif re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc):
-            position, ref_allele, alt_allele = re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc).groups()
+            position, ref_allele, alt_allele = re.match(
+                r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc
+            ).groups()
 
         elif re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc):
-            position, hgvsc_type, sequence = re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc).groups()
+            position, hgvsc_type, sequence = re.match(
+                r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc
+            ).groups()
 
         else:
             sys.exit("Error: not one of the known HGVSc strings: " + hgvsc)
@@ -572,7 +612,10 @@ def generate_translated_sequences(self, pad_len=10):
         mt_rev = mt[::-1]
         for i in range(0, min(len(wt), len(mt))):
             len_from_end = i
-            if len_from_end + len_from_start >= min(len(wt), len(mt)) or wt_rev[i : i + 1] != mt_rev[i : i + 1]:
+            if (
+                len_from_end + len_from_start >= min(len(wt), len(mt))
+                or wt_rev[i : i + 1] != mt_rev[i : i + 1]
+            ):
                 break
 
         wt_start = len_from_start
@@ -584,8 +627,12 @@ def generate_translated_sequences(self, pad_len=10):
         self.wt_aa = wt
         self.mt_aa = mt
-        self.wt_altered_aa = wt[max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1)]
-        self.mt_altered_aa = mt[max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1)]
+        self.wt_altered_aa = wt[
+            max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1)
+        ]
+        self.mt_altered_aa = mt[
+            max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1)
+        ]
 
     # function to iterate over all the the neopeptide predictions made in the entire MAF and identify
     # which neopeptides are generated by this mutation object
@@ -617,7 +664,9 @@ def get_maf_row_to_print(self):
             row["neo_best_algorithm"] = best_binder.algorithm
             row["neo_best_version"] = best_binder.version
             row["neo_best_hla_allele"] = best_binder.hla_allele
-            row["neo_n_peptides_evaluated"] = len(self.predicted_neopeptides.get_best_per_icore())
+            row["neo_n_peptides_evaluated"] = len(
+                self.predicted_neopeptides.get_best_per_icore()
+            )
             row["neo_n_strong_binders"] = len(strong_binders)
             row["neo_n_weak_binders"] = len(weak_binders)
         else:
diff --git a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
index c980561..5aa3e09 100755
--- a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
+++ b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
@@ -9,8 +9,8 @@
 
 VERSION = 1.7
 
-
 def main(args):
+
     def makeChild(subTree, start):
         if start:
             subTree = 0
@@ -37,10 +37,16 @@ def makeChild(subTree, start):
                     pass
                 else:
                     for ssm in treefile["mut_assignments"][str(subTree)]["ssms"]:
-                        ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"])
+                        ssmli.append(
+                            chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"]
+                        )
                 newsubtree["clone_mutations"] = ssmli
-                newsubtree["X"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
-                newsubtree["x"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
+                newsubtree["X"] = trees[tree]["populations"][str(subTree)][
+                    "cellular_prevalence"
+                ][0]
+                newsubtree["x"] = trees[tree]["populations"][str(subTree)][
+                    "cellular_prevalence"
+                ][0]
                 newsubtree["new_x"] = 0.0
             except Exception as e:
                 print("Error in adding new subtree. Error not in base case**")
@@ -59,15 +65,21 @@ def makeChild(subTree, start):
             try:
                 ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"])
             except Exception as e:
-                print("Error in appending to mutation list. Error in base case appending ssm to ssmli")
+                print(
+                    "Error in appending to mutation list. Error in base case appending ssm to ssmli"
+                )
                 print(e)
                 # print(str(subTree))
                 pass
         try:
             newsubtree["clone_mutations"] = ssmli
-            newsubtree["X"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
-            newsubtree["x"] = trees[tree]["populations"][str(subTree)]["cellular_prevalence"][0]
+            newsubtree["X"] = trees[tree]["populations"][str(subTree)][
+                "cellular_prevalence"
+            ][0]
+            newsubtree["x"] = trees[tree]["populations"][str(subTree)][
+                "cellular_prevalence"
+            ][0]
             newsubtree["new_x"] = 0.0
         except Exception as e:
             print("Error in adding new subtree. Error in base case")
@@ -93,7 +105,7 @@ def makeChild(subTree, start):
 
     for index, row in mafdf.iterrows():
         if (
-            # We
+            #We
             row["Variant_Type"] == "SNP"
             or row["Variant_Type"] == "DEL"
             or row["Variant_Type"] == "INS"
@@ -343,13 +355,13 @@ def find_first_difference_index(str1, str2):
     # This is used as last resort for the matching. We will preferentially find the peptide matching in length as well as POS. Worst case we will default to the WT pos 0
         if noposID not in WTdict:
             WTdict[noposID] = {
-                "peptides": {row_WT["peptide"]: id},  # This is a dict so we can match the peptide with the ID later
-                "affinity": row_WT["affinity"],
+                'peptides' : {row_WT["peptide"]:id}, #This is a dict so we can match the peptide with the ID later
+                "affinity": row_WT["affinity"]
             }
         else:
             # print(WTdict[noposID]['peptides'])
-            WTdict[noposID]["peptides"][row_WT["peptide"]] = id
+            WTdict[noposID]['peptides'][row_WT["peptide"]]=id
 
     def find_most_similar_string(target, strings):
         max_score = -1
@@ -375,7 +387,7 @@ def find_most_similar_string(target, strings):
                 max_score2 = score
                 most_similar_string2 = s
 
-            if target[0] == s[0]:
+            if target[0]==s[0]:
                 if score > first_AA_same_score:
                     first_AA_same = s
                     first_AA_same_score = score
@@ -383,13 +395,13 @@ def find_most_similar_string(target, strings):
         return most_similar_string, most_similar_string2, first_AA_same, first_AA_same_score, max_score
 
     for index_mut, row_mut in neoantigen_mut_in.iterrows():
-        IDsplit = row_mut["Identity"].split("_")
-        if row_mut["affinity"] < 500:
+        IDsplit = row_mut["Identity"].split('_')
+        if row_mut["affinity"]< 500:
             peplen = len(row_mut["peptide"])
             matchfound = False
-            IDsplit = row_mut["Identity"].split("_")
-            if IDsplit[1][0] == "S" and IDsplit[1][1] != "p":
-                # If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. Splices can be Sp
+            IDsplit = row_mut["Identity"].split('_')
+            if (IDsplit[1][0] == "S" and IDsplit[1][1] != 'p') :
+                #If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. Splices can be Sp
                 continue
             # first find match in WT
             WTid = (
@@ -410,7 +422,7 @@
                 + row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "")
             )
 
-            if WTid in WTdict and ("M" == IDsplit[1][0] and "Sp" not in row_mut["Identity"]):
+            if WTid in WTdict and ('M' == IDsplit[1][0] and 'Sp' not in row_mut["Identity"]):
                 # match
                 matchfound = True
                 best_pepmatch = WTdict[WTid]["peptide"]
@@ -424,31 +436,28 @@
                 # print(mutation_dict[row_mut["Identity"]])
 
             else:
-                (
-                    best_pepmatch,
-                    best_pepmatch2,
-                    first_AA_same,
-                    first_AA_same_score,
-                    match_score,
-                ) = find_most_similar_string(row_mut["peptide"], list(WTdict[noposID]["peptides"].keys()))
+                best_pepmatch,best_pepmatch2 , first_AA_same, first_AA_same_score, match_score = find_most_similar_string(row_mut["peptide"],list(WTdict[noposID]['peptides'].keys()))
 
                 if best_pepmatch == row_mut["peptide"]:
-                    # it seems this can happen where the row_mut is actually the canonical sequence.
+                    #it seems this can happen where the row_mut is actually the canonical sequence.
                     # In this case we don't want to report the peptide as a neoantigen, its not neo
                     continue
-                elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or (
-                    best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1]
-                ):
+                elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or (best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1]):
                     # We should preferentially match the first AA if we can. I have found that the pairwise alignment isnt always the best at this.
                     # It will also do this when the last AA of the best match doesnt match but the last A of the second best match does
                     best_pepmatch = best_pepmatch2
 
-                WTid = WTdict[noposID]["peptides"][best_pepmatch]
-                matchfound = True
+                WTid = WTdict[noposID]['peptides'][best_pepmatch]
+                matchfound=True
 
             if matchfound == True:
-                mut_pos = find_first_difference_index(row_mut["peptide"], best_pepmatch) + 1  # WTdict[WTid]["peptide"]
+                mut_pos = (
+                    find_first_difference_index(
+                        row_mut["peptide"], best_pepmatch #WTdict[WTid]["peptide"]
+                    )
+                    + 1
+                )
 
                 neo_dict = {
                     "id": row_mut["Identity"]
@@ -461,7 +470,7 @@
                     "mutation_id": mutation_dict[row_mut["Identity"]],
                     "HLA_gene_id": row_mut["MHC"],
                     "sequence": row_mut["peptide"],
-                    "WT_sequence": best_pepmatch,  # WTdict[WTid]["peptide"],
+                    "WT_sequence": best_pepmatch ,#WTdict[WTid]["peptide"],
                     "mutated_position": mut_pos,
                     "Kd": float(row_mut["affinity"]),
                     "KdWT": float(WTdict[WTid]["affinity"]),
@@ -514,7 +523,7 @@ def makeID(maf_row):
         "Frame_shift_Del": "I-",
         "In_Frame_Ins": "If",
         "In_Frame_Del": "Id",
-        "Splice_Site": "Sp",
+        "Splice_Site": "Sp"
     }
 
     position = int(str(maf_row["Start_Position"])[0:2])
@@ -564,15 +573,26 @@ def makeID(maf_row):
         )
 
     else:
-        identifier_key = str(maf_row["Chromosome"]) + encoded_position + "_" + "SY" + Allele2code + "_M"
+        identifier_key = (
+            str(maf_row["Chromosome"])
+            + encoded_position
+            + "_"
+            + "SY"
+            + Allele2code
+            + "_M"
+        )
 
     return identifier_key
 
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Process input files and parameters")
     parser.add_argument("--maf_file", required=True, help="Path to the MAF file")
-    parser.add_argument("--summary_file", required=True, help="Path to the summary file")
-    parser.add_argument("--mutation_file", required=True, help="Path to the mutation file")
+    parser.add_argument(
+        "--summary_file", required=True, help="Path to the summary file"
+    )
+    parser.add_argument(
+        "--mutation_file", required=True, help="Path to the mutation file"
+    )
     parser.add_argument(
         "--tree_directory",
         required=True,
@@ -581,7 +601,9 @@ def parse_args():
     parser.add_argument("--id", required=True, help="ID")
     parser.add_argument("--patient_id", required=True, help="Patient ID")
     parser.add_argument("--cohort", required=True, help="Cohort")
-    parser.add_argument("--HLA_genes", required=True, help="Path to the file containing HLA genes")
+    parser.add_argument(
+        "--HLA_genes", required=True, help="Path to the file containing HLA genes"
+    )
     parser.add_argument(
         "--netMHCpan_MUT_input",
         required=True,
@@ -597,7 +619,9 @@ def parse_args():
         "--patient_data_file",
         help="Path to the optional file containing status, overall survival, and PFS",
    )
-    parser.add_argument("-v", "--version", action="version", version="%(prog)s {}".format(VERSION))
+    parser.add_argument(
+        "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
+    )
 
     return parser.parse_args()
diff --git a/modules/msk/netmhcpan/main.nf b/modules/msk/netmhcpan/main.nf
index 4f03154..bc54431 100644
--- a/modules/msk/netmhcpan/main.nf
+++ b/modules/msk/netmhcpan/main.nf
@@ -25,14 +25,25 @@ process NETMHCPAN {
     output_meta = meta.clone()
     output_meta.typeMut = inputType == "MUT" ? true : false
     output_meta.fromStab = false
+    def NETMHCPAN_VERSION = "4.1"
 
     """
-    /usr/local/bin/netMHCpan-4.1/netMHCpan -s 0 -BA 1 -f ${inputFasta} -a ${hla} -l 9,10 -inptype 0 -xls -xlsfile ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output
+    /usr/local/bin/netMHCpan-${NETMHCPAN_VERSION}/netMHCpan \
+        -s 0 \
+        -BA 1 \
+        -f ${inputFasta} \
+        -a ${hla} \
+        -l 9,10 \
+        -inptype 0 \
+        -xls \
+        ${args} \
+        -xlsfile \
+        ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        netmhcpan: v4.1
+        netmhcpan: v${NETMHCPAN_VERSION}
     END_VERSIONS
     """

@@ -40,6 +51,7 @@ process NETMHCPAN {
     stub:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def NETMHCPAN_VERSION = "4.1"
     output_meta = meta.clone()
     output_meta.typeMut = inputType == "MUT" ? true : false
     output_meta.fromStab = false
@@ -49,7 +61,7 @@ process NETMHCPAN {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        netmhcpan: v4.1
+        netmhcpan: v${NETMHCPAN_VERSION}
     END_VERSIONS
     """
 }
diff --git a/modules/msk/netmhcstabpan/main.nf b/modules/msk/netmhcstabpan/main.nf
index df5b61a..316195f 100644
--- a/modules/msk/netmhcstabpan/main.nf
+++ b/modules/msk/netmhcstabpan/main.nf
@@ -25,14 +25,23 @@ process NETMHCSTABPAN {
     output_meta = meta.clone()
     output_meta.typeMut = inputType == "MUT" ? true : false
     output_meta.fromStab = true
+
+    def NETMHCPAN_VERSION = "4.1"
+    def NETMHCSTABPAN_VERSION = "1.0"
+
     """
-    /usr/local/bin/netMHCstabpan-1.0/netMHCstabpan -s -1 -f ${inputFasta} -a ${hla} -l 9,10 -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output
+    /usr/local/bin/netMHCstabpan-${NETMHCSTABPAN_VERSION}/netMHCstabpan \
+        -s -1 \
+        -f ${inputFasta} \
+        -a ${hla} \
+        -l 9,10 \
+        -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        netmhcpan: v4.1
-        netmhcstabpan: v1.0
+        netmhcpan: v${NETMHCPAN_VERSION}
+        netmhcstabpan: v${NETMHCSTABPAN_VERSION}
     END_VERSIONS
     """

@@ -43,14 +52,17 @@ process NETMHCSTABPAN {
     output_meta = meta.clone()
     output_meta.typeMut = inputType == "MUT" ? true : false
     output_meta.fromStab = true
+    def NETMHCPAN_VERSION = "4.1"
+    def NETMHCSTABPAN_VERSION = "1.0"
+
     """
     touch ${prefix}.MUT.netmhcstabpan.output
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        netmhcpan: v4.1
-        netmhcstabpan: v1.0
+        netmhcpan: v${NETMHCPAN_VERSION}
+        netmhcstabpan: v${NETMHCSTABPAN_VERSION}
     END_VERSIONS
     """
 }
diff --git a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py
index b565290..e15092a 100755
--- a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py
+++ b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py
@@ -22,7 +22,9 @@ class ReadCountsUnavailableError(Exception):
 
 
 class MafVariant(object):
-    def __init__(self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele):
+    def __init__(
+        self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele
+    ):
         self.CHROM = chromosome
         self.POS = position
         self.FILTER = filter
@@ -105,8 +107,12 @@ def _get_tumor_index(self, variant, tumor_sample=None):
         Might not always be true
         """
         if self._tumor_sample:
-            tumor_is = [i for i, s in enumerate(variant.samples) if s.sample == tumor_sample]
-            assert len(tumor_is) == 1, "Did not find tumor name %s in samples" % tumor_sample
+            tumor_is = [
+                i for i, s in enumerate(variant.samples) if s.sample == tumor_sample
+            ]
+            assert len(tumor_is) == 1, (
+                "Did not find tumor name %s in samples" % tumor_sample
+            )
             return tumor_is[0]
         else:
             # Don't make this -1, as some code assumes it will be >= 0.
@@ -146,10 +152,14 @@ def _calc_read_counts(self, variant):
             },
         }
 
-        ref_reads = tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt]
+        ref_reads = (
+            tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt]
+        )
         # For now, variant reads are defined as only the non-reference nucleotide in
         # the inferred tumor SNP. We ignore reads of a third or fourth base.
-        variant_reads = tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt]
+        variant_reads = (
+            tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt]
+        )
 
         total_reads = ref_reads + variant_reads
 
         return (ref_reads, total_reads)
@@ -235,7 +245,9 @@ def _calc_read_counts(self, variant):
             total_reads = 0
             variant_reads = 0
         else:
-            variant_reads = int(getattr(variant.samples[tumor_i].data, str(alt) + "U")[0])
+            variant_reads = int(
+                getattr(variant.samples[tumor_i].data, str(alt) + "U")[0]
+            )
 
         ref_reads = total_reads - variant_reads
         return (ref_reads, total_reads)
@@ -299,6 +311,7 @@ def _calc_read_counts(self, variant):
 
 
 class MafParser(MutectSmchetParser):
+
     def _calc_read_counts(self, variant):
         total_reads = variant.T_REF + variant.T_ALT
         return (variant.T_REF, total_reads)
@@ -323,7 +336,9 @@ def _parse_maf(self, maf_filename):
             pos = start
             if variant_type == "DEL":
                 pos = pos - 1
-            variant = MafVariant(chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele)
+            variant = MafVariant(
+                chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele
+            )
             variant_list.append(variant)
         return variant_list
@@ -433,7 +448,9 @@ def _format_cnvs(self, cnvs, variants):
 
         for chrom, chrom_cnvs in cnvs.items():
             for cnv in chrom_cnvs:
-                overlapping_variants = self._find_overlapping_variants(chrom, cnv, variants)
+                overlapping_variants = self._find_overlapping_variants(
+                    chrom, cnv, variants
+                )
                 total_reads = self._calc_total_reads(cnv["start"], cnv["end"])
                 ref_reads = self._calc_ref_reads(cnv["cell_prev"], total_reads)
                 yield {
@@ -477,12 +494,20 @@ def format_and_merge_cnvs(self, cnvs, variants, cellularity):
             for K in ("chrom", "start", "end", "major_cn", "minor_cn"):
                 physical_cnvs[K] = cnv[K]
 
-            assert len(set(physical_cnvs["major_cn"])) == len(set(physical_cnvs["major_cn"])) == 1
+            assert (
+                len(set(physical_cnvs["major_cn"]))
+                == len(set(physical_cnvs["major_cn"]))
+                == 1
+            )
             physical_cnvs["major_cn"] = physical_cnvs["major_cn"][0]
             physical_cnvs["minor_cn"] = physical_cnvs["minor_cn"][0]
 
-            physical_cnvs["cell_prev"] = "|".join([str(C) for C in cnv["cellular_prevalence"]])
-            cnv["physical_cnvs"] = ",".join(["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()])
+            physical_cnvs["cell_prev"] = "|".join(
+                [str(C) for C in cnv["cellular_prevalence"]]
+            )
+            cnv["physical_cnvs"] = ",".join(
+                ["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()]
+            )
 
         merged, formatted = formatted[:1], formatted[1:]
         merged[0]["cnv_id"] = "c0"
@@ -496,13 +521,20 @@ def format_and_merge_cnvs(self, cnvs, variants, cellularity):
 
             # Only merge CNVs if they're clonal. If they're subclonal, leave them
             # free to move around the tree.
-            if np.array_equal(current["cellular_prevalence"], last["cellular_prevalence"]) and np.array_equal(
-                last["cellular_prevalence"], cellularity
-            ):
+            if np.array_equal(
+                current["cellular_prevalence"], last["cellular_prevalence"]
+            ) and np.array_equal(last["cellular_prevalence"], cellularity):
                 # Merge the CNVs.
-                log("Merging %s_%s and %s_%s" % (current["chrom"], current["start"], last["chrom"], last["start"]))
-                last["total_reads"] = self._cap_cnv_D(current["total_reads"] + last["total_reads"])
-                last["ref_reads"] = self._calc_ref_reads(last["cellular_prevalence"], last["total_reads"])
+                log(
+                    "Merging %s_%s and %s_%s"
+                    % (current["chrom"], current["start"], last["chrom"], last["start"])
+                )
+                last["total_reads"] = self._cap_cnv_D(
+                    current["total_reads"] + last["total_reads"]
+                )
+                last["ref_reads"] = self._calc_ref_reads(
+                    last["cellular_prevalence"], last["total_reads"]
+                )
                 last["physical_cnvs"] += ";" + current["physical_cnvs"]
                 self._merge_variants(last, current)
             else:
@@ -534,7 +566,9 @@ def _calc_ref_freq(self, ref_genotype, error_rate):
             raise Exception("Nonsensical frequency: %s" % freq)
         return freq
 
-    def format_variants(self, variants, ref_read_counts, total_read_counts, error_rate, sex):
+    def format_variants(
+        self, variants, ref_read_counts, total_read_counts, error_rate, sex
+    ):
         for variant_idx, variant in enumerate(variants):
             ssm_id = "s%s" % self._counter
             if hasattr(variant, "ID") and variant.ID is not None:
@@ -649,7 +683,10 @@ def _create_intervals(self, cnv_set):
             # some intervals revealed that the code crashes on this input. We
             # should provide a more informative error in such cases, which the
             # following assertion does.
-            assert cnv["start"] < cnv["end"], "In CNV %s, start position occurs at or after the end position" % cnv
+            assert cnv["start"] < cnv["end"], (
+                "In CNV %s, start position occurs at or after the end position"
+                % cnv
+            )
 
             start_pos = [
                 (
@@ -670,7 +707,9 @@ def _create_intervals(self, cnv_set):
 
         # True > False, so this sorting will place start positions after end
         # positions if both have same coordinate.
-        positions = sorted(start_pos + end_pos, key=lambda e: (e[0], e[1] == "start"))
+        positions = sorted(
+            start_pos + end_pos, key=lambda e: (e[0], e[1] == "start")
+        )
         assert len(positions) >= 2, "Fewer than two positions in %s" % positions
 
         # prev_pos is updated each time we move to a new coordinate on the
@@ -701,7 +740,9 @@ def _create_intervals(self, cnv_set):
                     assert locus > prev_pos
                     interval = (prev_pos, locus)
                     if interval[1] - interval[0] > min_size_for_inclusion:
-                        intervals[chrom].append((interval[0], interval[1], sorted(list(set(open_samples)))))
+                        intervals[chrom].append(
+                            (interval[0], interval[1], sorted(list(set(open_samples))))
+                        )
                 else:
                     # All points should be start points.
                     assert set([i[1] for i in points_at_locus]) == set(["start"])
@@ -831,7 +872,9 @@ def _get_abnormal_state_for_all_samples(self, chrom, cnv):
         abnormal_state = None
         filtered = []
 
-        for sampidx, cell_prev, major, minor in zip(cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"]):
+        for sampidx, cell_prev, major, minor in zip(
+            cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"]
+        ):
             # Region may be (clonal or subclonal) normal in a sample, so ignore such records.
             if self._is_region_normal_cn(chrom, major, minor):
                 continue
 
@@ -908,12 +951,15 @@ def load_single_abnormal_state_cnvs(self):
             if not is_good_chrom(chrom):
                 continue
             for cnv in chrom_cnvs:
-                states_for_all_samples = self._get_abnormal_state_for_all_samples(chrom, cnv)
+                states_for_all_samples = self._get_abnormal_state_for_all_samples(
+                    chrom, cnv
+                )
                 if states_for_all_samples is None:
                     continue
 
                 combined_states = {
-                    K: np.array([S[K] for S in states_for_all_samples]) for K in states_for_all_samples[0].keys()
+                    K: np.array([S[K] for S in states_for_all_samples])
+                    for K in states_for_all_samples[0].keys()
                 }
                 cnv.update(combined_states)
                 abnormal_cnvs[chrom].append(cnv)
@@ -932,7 +978,9 @@ def load_normal_cnvs(self):
             if not is_good_chrom(chrom):
                 continue
             for cnv in chrom_cnvs:
-                if not self._is_multisample_region_normal_cn(chrom, cnv["major_cn"], cnv["minor_cn"]):
+                if not self._is_multisample_region_normal_cn(
+                    chrom, cnv["major_cn"], cnv["minor_cn"]
+                ):
                     continue
                 if not set(cnv["sampidx"]) == self.sampidxs:
                     continue
@@ -1081,7 +1129,9 @@ def retain_only_variants_in_normal_cn_regions(self):
             raise Exception("CN regions not yet provided")
         normal_cn = self._multisamp_cnv.load_normal_cnvs()
 
-        filtered = self._filter_variants_outside_regions(normal_cn, "all_variants", "only_normal_cn")
+        filtered = self._filter_variants_outside_regions(
+            normal_cn, "all_variants", "only_normal_cn"
+        )
 
     def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self):
         # Battenberg:
@@ -1104,9 +1154,13 @@ def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self):
         # If variant isn't listed in *any* region: exclude (as we suspect CNV
         # caller didn't know what to do with the region).
-        self._filter_variants_outside_regions(self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions")
+        self._filter_variants_outside_regions(
+            self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions"
+        )
 
-    def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, sex):
+    def format_variants(
+        self, sample_size, error_rate, priority_ssms, only_priority, sex
+    ):
         if sample_size is None:
             sample_size = len(self._variant_idxs)
         random.shuffle(self._variant_idxs)
@@ -1135,7 +1189,11 @@ def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, sex):
             else:
                 nonsubsampled.append(variant_idx)
 
-        assert len(used_variant_idxs) == len(self._variant_idxs) == len(subsampled) + len(nonsubsampled)
+        assert (
+            len(used_variant_idxs)
+            == len(self._variant_idxs)
+            == len(subsampled) + len(nonsubsampled)
+        )
 
         subsampled.sort(key=lambda idx: variant_key(self._variants[idx]))
         subsampled_variants = get_elements_at_indices(self._variants, subsampled)
@@ -1174,7 +1232,9 @@ def write_variants(self, variants, outfn):
             print("\t".join(("id", "gene", "a", "d", "mu_r", "mu_v")), file=outf)
             for variant in variants:
                 variant["ref_reads"] = ",".join([str(v) for v in variant["ref_reads"]])
-                variant["total_reads"] = ",".join([str(v) for v in variant["total_reads"]])
+                variant["total_reads"] = ",".join(
+                    [str(v) for v in variant["total_reads"]]
+                )
                 vals = (
                     "ssm_id",
                     "variant_name",
@@ -1190,7 +1250,10 @@ def _estimate_read_depth(self):
         read_sum = 0
         if len(self._variants) == 0:
             default_read_depth = 50
-            log("No variants available, so fixing read depth at %s." % default_read_depth)
+            log(
+                "No variants available, so fixing read depth at %s."
+                % default_read_depth
+            )
             return default_read_depth
         else:
             return np.nanmedian(self._total_read_counts, axis=0)
@@ -1198,7 +1261,9 @@ def _estimate_read_depth(self):
     def write_cnvs(self, variants, outfn):
         with open(outfn, "w") as outf:
             print("\t".join(("cnv", "a", "d", "ssms", "physical_cnvs")), file=outf)
-            formatter = CnvFormatter(self._estimated_read_depth, self._sampidxs, self._hetsnp_rate)
+            formatter = CnvFormatter(
+                self._estimated_read_depth, self._sampidxs, self._hetsnp_rate
+            )
             for cnv in formatter.format_and_merge_cnvs(
                 self._multisamp_cnv.load_single_abnormal_state_cnvs(),
                 variants,
@@ -1298,7 +1363,9 @@ def impute_missing_total_reads(total_reads, missing_variant_confidence):
     assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0
 
     # Convert 1D arrays to vectors to permit matrix multiplication.
-    imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1)))
+    imputed_counts = np.dot(
+        variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1))
+    )
     nan_coords = np.where(np.isnan(total_reads))
     total_reads[nan_coords] = imputed_counts[nan_coords]
     assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0
@@ -1331,7 +1398,9 @@ def is_good_chrom(chrom):
         return False
 
 
-def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence):
+def parse_variants(
+    samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence
+):
     parsed_variants = []
     all_variant_ids = []
     num_samples = len(samples)
@@ -1376,7 +1445,11 @@ def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence):
                 )
             )
         else:
-            variant_ids.append(VariantId(str(single_variant[0].CHROM), int(single_variant[0].POS), None))
+            variant_ids.append(
+                VariantId(
+                    str(single_variant[0].CHROM), int(single_variant[0].POS), None
+                )
+            )
 
     all_variant_ids += variant_ids
 
     all_variant_ids = list(set(all_variant_ids))  # Eliminate duplicates.
@@ -1398,7 +1471,9 @@ def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence):
             ref_read_counts[variant_idx, sample_idx] = ref_reads
             total_read_counts[variant_idx, sample_idx] = total_reads
 
-    total_read_counts = impute_missing_total_reads(total_read_counts, missing_variant_confidence)
+    total_read_counts = impute_missing_total_reads(
+        total_read_counts, missing_variant_confidence
+    )
     ref_read_counts = impute_missing_ref_reads(ref_read_counts, total_read_counts)
 
     return (all_variant_ids, ref_read_counts, total_read_counts)
@@ -1411,7 +1486,9 @@ def infer_sex(variant_ids):
     return "female"
 
 
-def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples):
+def extract_sample_data(
+    vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples
+):
     vcf_files = {}
     vcf_types = {}
     cnv_files = {}
@@ -1426,7 +1503,9 @@ def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples):
 
     should_use_cnvs = cnv_files_and_samples is not None
     if should_use_cnvs:
-        assert len(cnv_files_and_samples) == len(vcf_files_and_samples), "Must specify same number of VCF and CNV files"
+        assert len(cnv_files_and_samples) == len(
+            vcf_files_and_samples
+        ), "Must specify same number of VCF and CNV files"
         srcs_and_dsts.append((cnv_files_and_samples, cnv_files))
 
     for src, dst in srcs_and_dsts:
@@ -1436,20 +1515,28 @@ def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples):
             dst[sample] = val
 
     # Sample order will dictate eventual output order.
-    common_samps = reduce(lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts])
+    common_samps = reduce(
+        lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts]
+    )
     ordered_samps = [S.split("=", 1)[0] for S in vcf_files_and_samples]
     assert len(ordered_samps) == len(common_samps)
 
     # Ensure no duplicates.
-    assert set(vcf_files.keys()) == common_samps, "VCF file samples (%s) differ from common samples (%s)" % (
+    assert (
+        set(vcf_files.keys()) == common_samps
+    ), "VCF file samples (%s) differ from common samples (%s)" % (
         vcf_files.keys(),
         common_samps,
     )
-    assert set(vcf_types.keys()) == common_samps, "VCF type samples (%s) differ from common samples (%s)" % (
+    assert (
+        set(vcf_types.keys()) == common_samps
+    ), "VCF type samples (%s) differ from common samples (%s)" % (
        vcf_types.keys(),
        common_samps,
    )
    if should_use_cnvs:
-        assert set(cnv_files.keys()) == common_samps, "CNV file samples (%s) differ from CNV file samples (%s)" % (
+        assert (
+            set(cnv_files.keys()) == common_samps
+        ), "CNV file samples (%s) differ from CNV file samples (%s)" % (
             cnv_files.keys(),
             common_samps,
         )
@@ -1600,7 +1687,9 @@ def main():
     log.verbose = args.verbose
     params = {}
 
-    samples, vcf_files, vcf_types, cnv_files = extract_sample_data(args.vcf_files, args.vcf_types, args.cnv_files)
+    samples, vcf_files, vcf_types, cnv_files = extract_sample_data(
+        args.vcf_files, args.vcf_types, args.cnv_files
+    )
     params["samples"], params["vcf_files"], params["vcf_types"], params["cnv_files"] = (
         samples,
         vcf_files,
@@ -1634,7 +1723,9 @@ def main():
         grouper.add_cnvs(cn_regions, sex)
 
     if not grouper.has_cnvs():
-        assert args.regions == "all", "If you do not provide CNA data, you must specify --regions=all"
+        assert (
+            args.regions == "all"
+        ), "If you do not provide CNA data, you must specify --regions=all"
 
     if args.regions == "normal_cn":
         grouper.retain_only_variants_in_normal_cn_regions()
@@ -1660,8 +1751,13 @@ def main():
     if grouper.has_cnvs() and args.regions != "normal_cn":
         # Write CNVs.
         grouper.write_cnvs(subsampled_vars, args.output_cnvs)
-        if args.output_nonsubsampled_variants and args.output_nonsubsampled_variants_cnvs:
-            grouper.write_cnvs(nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs)
+        if (
+            args.output_nonsubsampled_variants
+            and args.output_nonsubsampled_variants_cnvs
+        ):
+            grouper.write_cnvs(
+                nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs
+            )
     else:
         # Write empty CNV file.
with open(args.output_cnvs, "w"): diff --git a/subworkflows/msk/netmhcstabandpan/main.nf b/subworkflows/msk/netmhcstabandpan/main.nf index d0425ef..ea8247c 100644 --- a/subworkflows/msk/netmhcstabandpan/main.nf +++ b/subworkflows/msk/netmhcstabandpan/main.nf @@ -91,13 +91,5 @@ def createNETMHCInput(wt_fasta, mut_fasta, hla) { new Tuple(it[1][0], it[1][1],it[2][1],"WT") } merged = merged_mut.mix(merged_wt) - - -// merged = merged_mut -// .join(merged_wt) -// .map{ -// new Tuple(it[1][0],it[1][1],it[1][2],it[1][3]) -// } - return merged } diff --git a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap index 0d3ce18..dbde65e 100644 --- a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap +++ b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test.snap @@ -6,26 +6,15 @@ "test.MUT_sequences.fa:md5,7fdb7d3f0fe5a6f439ed294b612c2d70", "test.WT_sequences.fa:md5,7595ed6cf0c98500b00c9ad027125b38" ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-11T16:27:38.189273" + "timestamp": "2024-07-30T13:48:55.729458" }, "netmhcstabandpan - tsv,xls,fa - stub": { "content": [ - [ - "test.MUT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "test.WT.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ], + "test.WT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", "test.MUT.xls", "test.MUT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e", "test.WT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-06-11T16:27:49.026711" + "timestamp": "2024-07-30T13:49:11.413783" } } \ No newline at end of file From 23e53baa54cc2cdd619d919e38631a4b56aa5e1e Mon Sep 17 00:00:00 2001 From: Nikhil Kumar Date: Tue, 30 Jul 2024 15:11:03 -0400 Subject: [PATCH 2/2] Cleanup EpitopeDistance.py --- .../computefitness/resources/usr/bin/EpitopeDistance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py index ea09073..f278817 100755 --- a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py +++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py @@ -10,7 +10,7 @@ import json import os -#% + class EpitopeDistance(object): """Base class for epitope crossreactivity.