diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py
index 1fcb676..8f993dc 100644
--- a/scripts/LaunchMetrics.py
+++ b/scripts/LaunchMetrics.py
@@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo
     launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
-    bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
+    bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
     print(bsub_launch_dragen_rna)
     call(bsub_launch_dragen_rna, shell = True)
@@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas
     metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
     launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
-    bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
+    bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
     print(bsub_launch_dragen)
     call(bsub_launch_dragen, shell = True)
@@ -300,4 +300,4 @@ def launch_picard(bams_by_lane, run, sample, sample_parameters, work_directory):
-
\ No newline at end of file
+
diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py
index 3bead5c..dae2b40 100644
--- a/scripts/cellranger_multi.py
+++ b/scripts/cellranger_multi.py
@@ -2,7 +2,6 @@
 import os
 import subprocess
 import glob
-from subprocess import call
 import argparse
 from collections import OrderedDict
 import requests
@@ -115,6 +114,7 @@ def write_ch_ge_only_to_csv(self, name_of_file):
         file.write("\n[libraries]\nfastq_id,fastqs,feature_types\n")
         for key, value in self.lirbaries.items():
+            key = key.replace("_CHMARKER_", "")
             if value[1] == "Gene Expression" or value[1] == "Multiplexing Capture":
                 for i in value[0]:
                     file.write("{},{},{}\n".format(key, i, value[1]))
@@ -191,7 +191,7 @@ def ch_file_generation(project_id, sample_name):
     tag_seq_dict = pd.Series(df['Hashtag sequence'].values,index=df['Hashtag Name']).to_dict()
     sub_sample_dict = {}
-    sub_sample_lst = df[df["Sample Name in IGO"] == sample_name]["Sample Name"].tolist()
+    sub_sample_lst = df[df["Sample Name in IGO"].astype(str) == str(sample_name)]["Sample Name"].tolist()
     for item in sub_sample_lst:
         sub_sample_dict[item] = sample_tag_dict[item]
@@ -235,8 +235,8 @@ def gather_config_info(sample_dict, genome, IGO_ID):
         config.gene_expression["cmo-set"] = CONFIG_AREA + "Project_{}/Project_{}_ch_{}.csv".format(project_ID, project_ID, sample_name)
         config.samples = ch_file_generation(project_ID, sample_name)
-        # if both ch and fb are there, change the ch name
-        if "ch" in sample_dict.keys() and "fb" in sample_dict.keys():
+        # if both ch and fb are there and vdj not there, change the ch name
+        if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and ("vdj" not in sample_dict.keys()):
             sample_dict["ch"] = sample_dict["ch"].replace("FB_IGO", "CH_IGO")
     # find fastq files for each sample and append information into config["libraries"]
@@ -245,6 +245,7 @@ def gather_config_info(sample_dict, genome, IGO_ID):
         sample_list.append(i)
     fastq_list = find_fastq_file(sample_list)
     for key, value in sample_dict.items():
+        print("key: {}, value: {}".format(key, value))
         if key == "ge":
             config.lirbaries[value] = [fastq_list[value], "Gene Expression"]
         elif key == "vdj":
@@ -252,7 +253,11 @@ def gather_config_info(sample_dict, genome, IGO_ID):
         elif key == "fb":
             config.lirbaries[value] = [fastq_list[value], "Antibody Capture"]
         elif key == "ch":
-            config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"]
+            # for case of all ch, fb and vdj exits and doesn't need to make two copies of fb fastq file
+            if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and "vdj" in sample_dict.keys():
+                config.lirbaries[value + "_CHMARKER_"] = [fastq_list[value], "Multiplexing Capture"]
+            else:
+                config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"]
     return config
@@ -401,7 +406,10 @@ def gather_sample_set_info(sample_name):
             fb_type.append("Cell Hashing")
         if "Feature Barcoding" in tag_lst:
             fb_type.append("Feature Barcoding")
-        # TODO add vdj type
+        if "T Cells" in tag_lst:
+            vdj_type.append("VDJ-T")
+        if "B Cells" in tag_lst:
+            vdj_type.append("VDJ-B")
         print(fb_type, vdj_type)
         break
@@ -417,7 +425,7 @@ def gather_sample_set_info(sample_name):
             sample_set["ch"] = "_IGO_".join([value[1], key])
         if "10X_Genomics_VDJ" in value[2][0]:
             sample_set["vdj"] = "_IGO_".join([value[1], key])
-
+    # TODO add vdj type to the whole pipeline
     return sample_set
 # TODO check whether a project set is complete to launch pipeline
@@ -447,6 +455,7 @@
     genome = args.genome
     config = gather_config_info(sample_dict, genome, args.ge)
+    print(config.lirbaries)
     project_ID = "_".join(args.ge.split("IGO_")[1].split("_")[:-1])
     file_name = "{}Project_{}/{}.csv".format(CONFIG_AREA, project_ID, args.ge)
diff --git a/scripts/get_total_reads_from_demux.py b/scripts/get_total_reads_from_demux.py
index 612ef5e..9cf02b8 100644
--- a/scripts/get_total_reads_from_demux.py
+++ b/scripts/get_total_reads_from_demux.py
@@ -3,6 +3,7 @@
 import numpy
 import json
 import re
+import os
 # get total reads number from Demultiplex_Stats.csv file or json file and generate txt files for each sample
 # add DLP type function. For DLP, only total reads for each project is needed
@@ -98,22 +99,23 @@ def run(sample_sheet, sequencer_and_run):
     print("generate AM txt files to folder: {}".format(stats_done_dir))
 # generate AM txt files containing total reads by project ID such as "Project_12754_E"
-def by_project(sample_sheet, project_id, sequencer_and_run):
+def by_project_location(project_directory):
+    # get sample_ID list
+    sample_list_ori = os.listdir(project_directory)
+    sample_list = []
+    for sample in sample_list_ori:
+        # remove Sample_ prefix
+        sample_list.append(sample[7:])
+    # get run info from project_directory
+    sequencer_and_run = project_directory.split("/")[4]
+    sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
     sequencer = sequencer_and_run.split("_")[0]
     stats_done_dir = STATS_DONE_DIR_PREFIX + sequencer + "/"
     demux_report_file = "/igo/staging/FASTQ/" + sequencer_and_run + "/Reports/Demultiplex_Stats.csv"
-    # dictionary of Sample_ID->Project
-    sample_project_dict = pd.Series(sample_sheet.df_ss_data['Sample_Project'].values,index=sample_sheet.df_ss_data['Sample_ID']).to_dict()
-
-    sample_ID_list = []
-    # filter sample_ID by projectID and append to sample_ID_list
-    for sample, project in sample_project_dict.items():
-        if project == project_id:
-            sample_ID_list.append(sample)
-
-    total_reads_dict = get_total_reads(sample_ID_list, demux_report_file)
-    for sample in sample_ID_list:
+
+    total_reads_dict = get_total_reads(sample_list, demux_report_file)
+    for sample in sample_list:
         write_to_am_txt(sequencer_and_run_prefix, sample, total_reads_dict[sample], stats_done_dir)
     print("generate AM txt files to folder: {}".format(stats_done_dir))
diff --git a/scripts/organise_fastq_split_by_lane.py b/scripts/organise_fastq_split_by_lane.py
index f0d81a7..bb7c108 100644
--- a/scripts/organise_fastq_split_by_lane.py
+++ b/scripts/organise_fastq_split_by_lane.py
@@ -92,7 +92,7 @@ def correct_fastq_list_csv(demux_reports_dir):
     demux_dir = sys.argv[2]
     if demux_type == "create":
         create_fastq_folders(demux_dir)
-        # add correct fastq list step?
+        correct_fastq_list_csv(demux_dir+"/Reports")
     elif demux_type == "correct":
         correct_sample_folder_name(demux_dir)
     else:
diff --git a/stats_by_project_dag.py b/stats_by_project_dag.py
index 2f21d2a..b99dc9e 100644
--- a/stats_by_project_dag.py
+++ b/stats_by_project_dag.py
@@ -23,6 +23,7 @@ def run_stats(ds, **kwargs):
     import subprocess
     import scripts.cellranger_multi
     import os
+    import scripts.get_total_reads_from_demux
     project_directory = kwargs["params"]["project_directory"]
     recipe = kwargs["params"]["recipe"]
@@ -62,6 +63,8 @@ def run_stats(ds, **kwargs):
         cmd = "bsub -J ont_stats_{} -n 16 -M 16 /igo/work/nabors/tools/venvpy3/bin/python /igo/work/igo/igo-demux/scripts/ont_stats.py {}".format(project_id, project_directory)
         print(cmd)
         subprocess.run(cmd, shell=True)
+    elif recipe == "demux_stats":
+        scripts.get_total_reads_from_demux.by_project_location(project_directory)
     else:
         scripts.calculate_stats.main([project_directory, recipe, species])
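
Usage sketch (illustrative, not part of the diff): the new demux_stats recipe in stats_by_project_dag.py just forwards the DAG's project_directory param to scripts.get_total_reads_from_demux.by_project_location(). The snippet below shows how that entry point could be exercised outside Airflow; the run folder and project ID in the path are made-up examples, assuming the /igo/staging/FASTQ/<sequencer_and_run>/Project_<id> layout with Sample_* sub-folders that the new code expects.

    # Hypothetical driver for the new demux_stats code path (run from the igo-demux repo root).
    # The project_directory value below is invented for illustration only.
    import scripts.get_total_reads_from_demux as demux_stats

    project_directory = "/igo/staging/FASTQ/DIANA_0123_AH2VF7DSX7/Project_12345_B"

    # by_project_location() takes the 5th path element (project_directory.split("/")[4]) as the
    # run folder, reads that run's Reports/Demultiplex_Stats.csv, strips the "Sample_" prefix
    # from each sub-folder name, and writes one AM .txt file per sample into the stats-done
    # directory for that sequencer.
    demux_stats.by_project_location(project_directory)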