
Commit

Merge branch 'main' of https://github.com/mskcc/igo-demux
darrelln32 committed Apr 19, 2024
2 parents 13a544a + 7f974d8 commit f0b671f
Showing 5 changed files with 37 additions and 23 deletions.
6 changes: 3 additions & 3 deletions scripts/LaunchMetrics.py
@@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo


launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
-bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
+bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
print(bsub_launch_dragen_rna)
call(bsub_launch_dragen_rna, shell = True)

@@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas

metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
-bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
+bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
print(bsub_launch_dragen)
call(bsub_launch_dragen, shell = True)
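Both bsub changes in this file are the same edit: the DRAGEN jobs are now submitted with -m "id01", pinning them to a single execution host instead of the id01/id02/id03 group. A minimal sketch of how the submission string expands (the values below are hypothetical; only the bsub flags come from the code above):

from subprocess import call

# Hypothetical values, for illustration only.
job_prefix = "DRAGEN___"                            # dragen_job_name_header
sample_id = "Sample01_IGO_12345_1"                  # sample.sample_id
work_dir = "/igo/staging/stats/RUN/DRAGEN"          # dragen_directory
dragen_cmd = "/opt/edico/bin/dragen --ref-dir ..."  # the full DRAGEN command built above

# -J names the LSF job, -o writes its log, -cwd sets the working directory,
# -m "id01" restricts scheduling to host id01, -q selects the dragen queue.
bsub_cmd = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(
    job_prefix, sample_id, work_dir, dragen_cmd)
print(bsub_cmd)
call(bsub_cmd, shell=True)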

@@ -300,4 +300,4 @@ def launch_picard(bams_by_lane, run, sample, sample_parameters, work_directory):





23 changes: 16 additions & 7 deletions scripts/cellranger_multi.py
@@ -2,7 +2,6 @@
import os
import subprocess
import glob
-from subprocess import call
import argparse
from collections import OrderedDict
import requests
@@ -115,6 +114,7 @@ def write_ch_ge_only_to_csv(self, name_of_file):
file.write("\n[libraries]\nfastq_id,fastqs,feature_types\n")

for key, value in self.lirbaries.items():
+key = key.replace("_CHMARKER_", "")
if value[1] == "Gene Expression" or value[1] == "Multiplexing Capture":
for i in value[0]:
file.write("{},{},{}\n".format(key, i, value[1]))
@@ -191,7 +191,7 @@ def ch_file_generation(project_id, sample_name):
tag_seq_dict = pd.Series(df['Hashtag sequence'].values,index=df['Hashtag Name']).to_dict()

sub_sample_dict = {}
-sub_sample_lst = df[df["Sample Name in IGO"] == sample_name]["Sample Name"].tolist()
+sub_sample_lst = df[df["Sample Name in IGO"].astype(str) == str(sample_name)]["Sample Name"].tolist()
for item in sub_sample_lst:
sub_sample_dict[item] = sample_tag_dict[item]
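The astype(str) change above is the functional fix in ch_file_generation: if pandas reads the "Sample Name in IGO" column as a numeric dtype, comparing it against the string sample_name matches nothing and the sub-sample list comes back empty. Casting both sides to str makes the lookup robust to the inferred dtype. A small self-contained illustration (the example data is invented):

import pandas as pd

# pandas infers this column as int64, while sample_name arrives as a string.
df = pd.DataFrame({
    "Sample Name in IGO": [12345, 12345, 67890],
    "Sample Name": ["hashtag_A", "hashtag_B", "hashtag_C"],
})
sample_name = "12345"

print(df[df["Sample Name in IGO"] == sample_name]["Sample Name"].tolist())
# [] -- the int64 values never compare equal to the string

print(df[df["Sample Name in IGO"].astype(str) == str(sample_name)]["Sample Name"].tolist())
# ['hashtag_A', 'hashtag_B']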

@@ -235,8 +235,8 @@ def gather_config_info(sample_dict, genome, IGO_ID):
config.gene_expression["cmo-set"] = CONFIG_AREA + "Project_{}/Project_{}_ch_{}.csv".format(project_ID, project_ID, sample_name)
config.samples = ch_file_generation(project_ID, sample_name)

-# if both ch and fb are there, change the ch name
-if "ch" in sample_dict.keys() and "fb" in sample_dict.keys():
+# if both ch and fb are there and vdj not there, change the ch name
+if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and ("vdj" not in sample_dict.keys()):
sample_dict["ch"] = sample_dict["ch"].replace("FB_IGO", "CH_IGO")

# find fastq files for each sample and append information into config["libraries"]
@@ -245,14 +245,19 @@ def gather_config_info(sample_dict, genome, IGO_ID):
sample_list.append(i)
fastq_list = find_fastq_file(sample_list)
for key, value in sample_dict.items():
print("key: {}, value: {}".format(key, value))
if key == "ge":
config.lirbaries[value] = [fastq_list[value], "Gene Expression"]
elif key == "vdj":
config.lirbaries[value] = [fastq_list[value], "VDJ"]
elif key == "fb":
config.lirbaries[value] = [fastq_list[value], "Antibody Capture"]
elif key == "ch":
-config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"]
+# for the case where ch, fb and vdj all exist and we don't need to make two copies of the fb fastq file
+if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and "vdj" in sample_dict.keys():
+config.lirbaries[value + "_CHMARKER_"] = [fastq_list[value], "Multiplexing Capture"]
+else:
+config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"]

return config

@@ -401,7 +406,10 @@ def gather_sample_set_info(sample_name):
fb_type.append("Cell Hashing")
if "Feature Barcoding" in tag_lst:
fb_type.append("Feature Barcoding")
-# TODO add vdj type
+if "T Cells" in tag_lst:
+vdj_type.append("VDJ-T")
+if "B Cells" in tag_lst:
+vdj_type.append("VDJ-B")
print(fb_type, vdj_type)
break

@@ -417,7 +425,7 @@ def gather_sample_set_info(sample_name):
sample_set["ch"] = "_IGO_".join([value[1], key])
if "10X_Genomics_VDJ" in value[2][0]:
sample_set["vdj"] = "_IGO_".join([value[1], key])

+# TODO add vdj type to the whole pipeline
return sample_set

# TODO check whether a project set is complete to launch pipeline
@@ -447,6 +455,7 @@ def gather_sample_set_info(sample_name):

genome = args.genome
config = gather_config_info(sample_dict, genome, args.ge)
+print(config.lirbaries)
project_ID = "_".join(args.ge.split("IGO_")[1].split("_")[:-1])
file_name = "{}Project_{}/{}.csv".format(CONFIG_AREA, project_ID, args.ge)

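The thread running through these cellranger_multi.py hunks is VDJ support for cell-hashing projects. When a sample set has gene expression, cell hashing, feature barcoding and VDJ together, the hashing and feature-barcoding libraries point at the same FASTQ prefix, so the hashing entry is now stored under a key suffixed with "_CHMARKER_" instead of renaming (and effectively duplicating) the FB FASTQs; the suffix is stripped again when the multi config CSV is written. A sketch of that bookkeeping (sample and path names are invented; lirbaries follows the attribute spelling used in the script):

from collections import OrderedDict

lirbaries = OrderedDict()
fb_fastqs = ["/igo/staging/FASTQ/RUN/Project_12345/Sample_X_FB_IGO_12345_1"]

# fb and ch share the same FASTQ prefix, so the ch entry is keyed with a marker
# suffix instead of overwriting the Antibody Capture entry.
lirbaries["X_FB_IGO_12345_1"] = [fb_fastqs, "Antibody Capture"]
lirbaries["X_FB_IGO_12345_1" + "_CHMARKER_"] = [fb_fastqs, "Multiplexing Capture"]

# When the [libraries] rows are written, the marker is dropped so both entries
# reference the same fastq_id.
for key, (fastqs, feature_type) in lirbaries.items():
    key = key.replace("_CHMARKER_", "")
    for fq in fastqs:
        print("{},{},{}".format(key, fq, feature_type))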
26 changes: 14 additions & 12 deletions scripts/get_total_reads_from_demux.py
@@ -3,6 +3,7 @@
import numpy
import json
import re
+import os

# get total reads number from Demultiplex_Stats.csv file or json file and generate txt files for each sample
# add DLP type function. For DLP, only total reads for each project is needed
@@ -98,22 +99,23 @@ def run(sample_sheet, sequencer_and_run):
print("generate AM txt files to folder: {}".format(stats_done_dir))

# generate AM txt files containing total reads by project ID such as "Project_12754_E"
-def by_project(sample_sheet, project_id, sequencer_and_run):
+def by_project_location(project_directory):
+# get sample_ID list
+sample_list_ori = os.listdir(project_directory)
+sample_list = []
+for sample in sample_list_ori:
+# remove Sample_ prefix
+sample_list.append(sample[7:])
+# get run info from project_directory
+sequencer_and_run = project_directory.split("/")[4]

sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
sequencer = sequencer_and_run.split("_")[0]
stats_done_dir = STATS_DONE_DIR_PREFIX + sequencer + "/"
demux_report_file = "/igo/staging/FASTQ/" + sequencer_and_run + "/Reports/Demultiplex_Stats.csv"
-# dictionary of Sample_ID->Project
-sample_project_dict = pd.Series(sample_sheet.df_ss_data['Sample_Project'].values,index=sample_sheet.df_ss_data['Sample_ID']).to_dict()
-
-sample_ID_list = []
-# filter sample_ID by projectID and append to sample_ID_list
-for sample, project in sample_project_dict.items():
-if project == project_id:
-sample_ID_list.append(sample)
-
-total_reads_dict = get_total_reads(sample_ID_list, demux_report_file)
-for sample in sample_ID_list:

+total_reads_dict = get_total_reads(sample_list, demux_report_file)
+for sample in sample_list:
write_to_am_txt(sequencer_and_run_prefix, sample, total_reads_dict[sample], stats_done_dir)

print("generate AM txt files to folder: {}".format(stats_done_dir))
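The rewritten entry point replaces by_project(sample_sheet, project_id, sequencer_and_run) with by_project_location(project_directory): the sample IDs now come from the Sample_* folder names inside the project directory, and the run is recovered from the path itself. A sketch of the path handling this relies on (run and sample names are invented; it assumes the /igo/staging/FASTQ/<run>/Project_<id> layout used for demux_report_file above):

project_directory = "/igo/staging/FASTQ/DIANA_0123_AHXXXXXXX/Project_12754_E"

# index 4 of the split is the run folder:
# ['', 'igo', 'staging', 'FASTQ', 'DIANA_0123_AHXXXXXXX', 'Project_12754_E']
sequencer_and_run = project_directory.split("/")[4]
sequencer = sequencer_and_run.split("_")[0]                             # 'DIANA'
sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])

# sample folders are named Sample_<sample_ID>; [7:] strips the "Sample_" prefix
sample_folders = ["Sample_ABC_IGO_12754_E_1", "Sample_DEF_IGO_12754_E_2"]
sample_list = [name[7:] for name in sample_folders]
print(sequencer_and_run, sequencer, sample_list)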
2 changes: 1 addition & 1 deletion scripts/organise_fastq_split_by_lane.py
@@ -92,7 +92,7 @@ def correct_fastq_list_csv(demux_reports_dir):
demux_dir = sys.argv[2]
if demux_type == "create":
create_fastq_folders(demux_dir)
-# add correct fastq list step?
+correct_fastq_list_csv(demux_dir+"/Reports")
elif demux_type == "correct":
correct_sample_folder_name(demux_dir)
else:
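With this change the "create" mode also rewrites Reports/fastq_list.csv right after the per-lane sample folders are created, so the list stays consistent with the reorganised layout. A hedged sketch of invoking the two steps directly (the run path is invented, and the import assumes the script is used as a module rather than through its usual command-line entry point):

import scripts.organise_fastq_split_by_lane as organise

demux_dir = "/igo/staging/FASTQ/DIANA_0123_AHXXXXXXX"    # hypothetical run folder

organise.create_fastq_folders(demux_dir)                 # reorganise the demuxed FASTQs into sample folders
organise.correct_fastq_list_csv(demux_dir + "/Reports")  # fix the paths recorded in fastq_list.csv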
3 changes: 3 additions & 0 deletions stats_by_project_dag.py
@@ -23,6 +23,7 @@ def run_stats(ds, **kwargs):
import subprocess
import scripts.cellranger_multi
import os
+import scripts.get_total_reads_from_demux

project_directory = kwargs["params"]["project_directory"]
recipe = kwargs["params"]["recipe"]
@@ -62,6 +63,8 @@ def run_stats(ds, **kwargs):
cmd = "bsub -J ont_stats_{} -n 16 -M 16 /igo/work/nabors/tools/venvpy3/bin/python /igo/work/igo/igo-demux/scripts/ont_stats.py {}".format(project_id, project_directory)
print(cmd)
subprocess.run(cmd, shell=True)
elif recipe == "demux_stats":
scripts.get_total_reads_from_demux.by_project_location(project_directory)
else:
scripts.calculate_stats.main([project_directory, recipe, species])

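The new branch wires the reworked demux-reads helper into the stats-by-project DAG: triggering it with recipe "demux_stats" regenerates the per-sample AM total-reads files from the run's Demultiplex_Stats.csv instead of launching the full stats pipeline. A sketch of the dispatch (the params dict is a hypothetical stand-in for what Airflow passes into run_stats; it assumes the code runs from the igo-demux repo root):

import scripts.get_total_reads_from_demux

# Hypothetical trigger params, mirroring kwargs["params"] in run_stats above.
params = {
    "project_directory": "/igo/staging/FASTQ/DIANA_0123_AHXXXXXXX/Project_12754_E",
    "recipe": "demux_stats",
}

project_directory = params["project_directory"]
recipe = params["recipe"]

if recipe == "demux_stats":
    # writes the per-sample AM total-reads txt files into the stats-done folder
    scripts.get_total_reads_from_demux.by_project_location(project_directory)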
