
Commit

Merge branch 'main' of https://github.com/mskcc/igo-demux
darrelln32 committed Apr 19, 2024
2 parents 13a544a + 7f974d8 commit f0b671f
Showing 5 changed files with 37 additions and 23 deletions.
6 changes: 3 additions & 3 deletions scripts/LaunchMetrics.py
@@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo


launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
-bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
+bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
print(bsub_launch_dragen_rna)
call(bsub_launch_dragen_rna, shell = True)

@@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas

metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
-bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
+bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
print(bsub_launch_dragen)
call(bsub_launch_dragen, shell = True)
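Both bsub changes in this file are the same edit: the DRAGEN jobs are now submitted with -m "id01", pinning them to a single execution host instead of the id01/id02/id03 group. A minimal sketch of how the submission string expands (the values below are hypothetical; only the bsub flags come from the code above):

from subprocess import call

# Hypothetical values, for illustration only.
job_prefix = "DRAGEN___"                            # dragen_job_name_header
sample_id = "Sample01_IGO_12345_1"                  # sample.sample_id
work_dir = "/igo/staging/stats/RUN/DRAGEN"          # dragen_directory
dragen_cmd = "/opt/edico/bin/dragen --ref-dir ..."  # the full DRAGEN command built above

# -J names the LSF job, -o writes its log, -cwd sets the working directory,
# -m "id01" restricts scheduling to host id01, -q selects the dragen queue.
bsub_cmd = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(
    job_prefix, sample_id, work_dir, dragen_cmd)
print(bsub_cmd)
call(bsub_cmd, shell=True)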

@@ -300,4 +300,4 @@ def launch_picard(bams_by_lane, run, sample, sample_parameters, work_directory):





23 changes: 16 additions & 7 deletions scripts/cellranger_multi.py
@@ -2,7 +2,6 @@
import os
import subprocess
import glob
-from subprocess import call
import argparse
from collections import OrderedDict
import requests
@@ -115,6 +114,7 @@ def write_ch_ge_only_to_csv(self, name_of_file):
file.write("\n[libraries]\nfastq_id,fastqs,feature_types\n")

for key, value in self.lirbaries.items():
+key = key.replace("_CHMARKER_", "")
if value[1] == "Gene Expression" or value[1] == "Multiplexing Capture":
for i in value[0]:
file.write("{},{},{}\n".format(key, i, value[1]))
@@ -191,7 +191,7 @@ def ch_file_generation(project_id, sample_name):
tag_seq_dict = pd.Series(df['Hashtag sequence'].values,index=df['Hashtag Name']).to_dict()

sub_sample_dict = {}
-sub_sample_lst = df[df["Sample Name in IGO"] == sample_name]["Sample Name"].tolist()
+sub_sample_lst = df[df["Sample Name in IGO"].astype(str) == str(sample_name)]["Sample Name"].tolist()
for item in sub_sample_lst:
sub_sample_dict[item] = sample_tag_dict[item]
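The astype(str) change above is the functional fix in ch_file_generation: if pandas reads the "Sample Name in IGO" column as a numeric dtype, comparing it against the string sample_name matches nothing and the sub-sample list comes back empty. Casting both sides to str makes the lookup robust to the inferred dtype. A small self-contained illustration (the example data is invented):

import pandas as pd

# pandas infers this column as int64, while sample_name arrives as a string.
df = pd.DataFrame({
    "Sample Name in IGO": [12345, 12345, 67890],
    "Sample Name": ["hashtag_A", "hashtag_B", "hashtag_C"],
})
sample_name = "12345"

print(df[df["Sample Name in IGO"] == sample_name]["Sample Name"].tolist())
# [] -- the int64 values never compare equal to the string

print(df[df["Sample Name in IGO"].astype(str) == str(sample_name)]["Sample Name"].tolist())
# ['hashtag_A', 'hashtag_B']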

@@ -235,8 +235,8 @@ def gather_config_info(sample_dict, genome, IGO_ID):
config.gene_expression["cmo-set"] = CONFIG_AREA + "Project_{}/Project_{}_ch_{}.csv".format(project_ID, project_ID, sample_name)
config.samples = ch_file_generation(project_ID, sample_name)

-# if both ch and fb are there, change the ch name
-if "ch" in sample_dict.keys() and "fb" in sample_dict.keys():
+# if both ch and fb are there and vdj not there, change the ch name
+if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and ("vdj" not in sample_dict.keys()):
sample_dict["ch"] = sample_dict["ch"].replace("FB_IGO", "CH_IGO")

# find fastq files for each sample and append information into config["libraries"]
@@ -245,14 +245,19 @@ def gather_config_info(sample_dict, genome, IGO_ID):
sample_list.append(i)
fastq_list = find_fastq_file(sample_list)
for key, value in sample_dict.items():
print("key: {}, value: {}".format(key, value))
if key == "ge":
config.lirbaries[value] = [fastq_list[value], "Gene Expression"]
elif key == "vdj":
config.lirbaries[value] = [fastq_list[value], "VDJ"]
elif key == "fb":
config.lirbaries[value] = [fastq_list[value], "Antibody Capture"]
elif key == "ch":
-config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"]
+# for the case where ch, fb and vdj all exist and we don't need to make two copies of the fb fastq file
+if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and "vdj" in sample_dict.keys():
+config.lirbaries[value + "_CHMARKER_"] = [fastq_list[value], "Multiplexing Capture"]
+else:
+config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"]

return config

@@ -401,7 +406,10 @@ def gather_sample_set_info(sample_name):
fb_type.append("Cell Hashing")
if "Feature Barcoding" in tag_lst:
fb_type.append("Feature Barcoding")
-# TODO add vdj type
+if "T Cells" in tag_lst:
+vdj_type.append("VDJ-T")
+if "B Cells" in tag_lst:
+vdj_type.append("VDJ-B")
print(fb_type, vdj_type)
break

@@ -417,7 +425,7 @@ def gather_sample_set_info(sample_name):
sample_set["ch"] = "_IGO_".join([value[1], key])
if "10X_Genomics_VDJ" in value[2][0]:
sample_set["vdj"] = "_IGO_".join([value[1], key])

+# TODO add vdj type to the whole pipeline
return sample_set

# TODO check whether a project set is complete to launch pipeline
@@ -447,6 +455,7 @@ def gather_sample_set_info(sample_name):

genome = args.genome
config = gather_config_info(sample_dict, genome, args.ge)
+print(config.lirbaries)
project_ID = "_".join(args.ge.split("IGO_")[1].split("_")[:-1])
file_name = "{}Project_{}/{}.csv".format(CONFIG_AREA, project_ID, args.ge)

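The thread running through these cellranger_multi.py hunks is VDJ support for cell-hashing projects. When a sample set has gene expression, cell hashing, feature barcoding and VDJ together, the hashing and feature-barcoding libraries point at the same FASTQ prefix, so the hashing entry is now stored under a key suffixed with "_CHMARKER_" instead of renaming (and effectively duplicating) the FB FASTQs; the suffix is stripped again when the multi config CSV is written. A sketch of that bookkeeping (sample and path names are invented; lirbaries follows the attribute spelling used in the script):

from collections import OrderedDict

lirbaries = OrderedDict()
fb_fastqs = ["/igo/staging/FASTQ/RUN/Project_12345/Sample_X_FB_IGO_12345_1"]

# fb and ch share the same FASTQ prefix, so the ch entry is keyed with a marker
# suffix instead of overwriting the Antibody Capture entry.
lirbaries["X_FB_IGO_12345_1"] = [fb_fastqs, "Antibody Capture"]
lirbaries["X_FB_IGO_12345_1" + "_CHMARKER_"] = [fb_fastqs, "Multiplexing Capture"]

# When the [libraries] rows are written, the marker is dropped so both entries
# reference the same fastq_id.
for key, (fastqs, feature_type) in lirbaries.items():
    key = key.replace("_CHMARKER_", "")
    for fq in fastqs:
        print("{},{},{}".format(key, fq, feature_type))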
26 changes: 14 additions & 12 deletions scripts/get_total_reads_from_demux.py
@@ -3,6 +3,7 @@
import numpy
import json
import re
+import os

# get total reads number from Demultiplex_Stats.csv file or json file and generate txt files for each sample
# add DLP type function. For DLP, only total reads for each project is needed
@@ -98,22 +99,23 @@ def run(sample_sheet, sequencer_and_run):
print("generate AM txt files to folder: {}".format(stats_done_dir))

# generate AM txt files containing total reads by project ID such as "Project_12754_E"
-def by_project(sample_sheet, project_id, sequencer_and_run):
+def by_project_location(project_directory):
+# get sample_ID list
+sample_list_ori = os.listdir(project_directory)
+sample_list = []
+for sample in sample_list_ori:
+# remove Sample_ prefix
+sample_list.append(sample[7:])
+# get run info from project_directory
+sequencer_and_run = project_directory.split("/")[4]

sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
sequencer = sequencer_and_run.split("_")[0]
stats_done_dir = STATS_DONE_DIR_PREFIX + sequencer + "/"
demux_report_file = "/igo/staging/FASTQ/" + sequencer_and_run + "/Reports/Demultiplex_Stats.csv"
-# dictionary of Sample_ID->Project
-sample_project_dict = pd.Series(sample_sheet.df_ss_data['Sample_Project'].values,index=sample_sheet.df_ss_data['Sample_ID']).to_dict()
-
-sample_ID_list = []
-# filter sample_ID by projectID and append to sample_ID_list
-for sample, project in sample_project_dict.items():
-if project == project_id:
-sample_ID_list.append(sample)
-
-total_reads_dict = get_total_reads(sample_ID_list, demux_report_file)
-for sample in sample_ID_list:

+total_reads_dict = get_total_reads(sample_list, demux_report_file)
+for sample in sample_list:
write_to_am_txt(sequencer_and_run_prefix, sample, total_reads_dict[sample], stats_done_dir)

print("generate AM txt files to folder: {}".format(stats_done_dir))
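The rewritten entry point replaces by_project(sample_sheet, project_id, sequencer_and_run) with by_project_location(project_directory): the sample IDs now come from the Sample_* folder names inside the project directory, and the run is recovered from the path itself. A sketch of the path handling this relies on (run and sample names are invented; it assumes the /igo/staging/FASTQ/<run>/Project_<id> layout used for demux_report_file above):

project_directory = "/igo/staging/FASTQ/DIANA_0123_AHXXXXXXX/Project_12754_E"

# index 4 of the split is the run folder:
# ['', 'igo', 'staging', 'FASTQ', 'DIANA_0123_AHXXXXXXX', 'Project_12754_E']
sequencer_and_run = project_directory.split("/")[4]
sequencer = sequencer_and_run.split("_")[0]                             # 'DIANA'
sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])

# sample folders are named Sample_<sample_ID>; [7:] strips the "Sample_" prefix
sample_folders = ["Sample_ABC_IGO_12754_E_1", "Sample_DEF_IGO_12754_E_2"]
sample_list = [name[7:] for name in sample_folders]
print(sequencer_and_run, sequencer, sample_list)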
2 changes: 1 addition & 1 deletion scripts/organise_fastq_split_by_lane.py
@@ -92,7 +92,7 @@ def correct_fastq_list_csv(demux_reports_dir):
demux_dir = sys.argv[2]
if demux_type == "create":
create_fastq_folders(demux_dir)
-# add correct fastq list step?
+correct_fastq_list_csv(demux_dir+"/Reports")
elif demux_type == "correct":
correct_sample_folder_name(demux_dir)
else:
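With this change the "create" mode also rewrites Reports/fastq_list.csv right after the per-lane sample folders are created, so the list stays consistent with the reorganised layout. A hedged sketch of invoking the two steps directly (the run path is invented, and the import assumes the script is used as a module rather than through its usual command-line entry point):

import scripts.organise_fastq_split_by_lane as organise

demux_dir = "/igo/staging/FASTQ/DIANA_0123_AHXXXXXXX"    # hypothetical run folder

organise.create_fastq_folders(demux_dir)                 # reorganise the demuxed FASTQs into sample folders
organise.correct_fastq_list_csv(demux_dir + "/Reports")  # fix the paths recorded in fastq_list.csv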
3 changes: 3 additions & 0 deletions stats_by_project_dag.py
@@ -23,6 +23,7 @@ def run_stats(ds, **kwargs):
import subprocess
import scripts.cellranger_multi
import os
+import scripts.get_total_reads_from_demux

project_directory = kwargs["params"]["project_directory"]
recipe = kwargs["params"]["recipe"]
@@ -62,6 +63,8 @@ def run_stats(ds, **kwargs):
cmd = "bsub -J ont_stats_{} -n 16 -M 16 /igo/work/nabors/tools/venvpy3/bin/python /igo/work/igo/igo-demux/scripts/ont_stats.py {}".format(project_id, project_directory)
print(cmd)
subprocess.run(cmd, shell=True)
elif recipe == "demux_stats":
scripts.get_total_reads_from_demux.by_project_location(project_directory)
else:
scripts.calculate_stats.main([project_directory, recipe, species])

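The new branch wires the reworked demux-reads helper into the stats-by-project DAG: triggering it with recipe "demux_stats" regenerates the per-sample AM total-reads files from the run's Demultiplex_Stats.csv instead of launching the full stats pipeline. A sketch of the dispatch (the params dict is a hypothetical stand-in for what Airflow passes into run_stats; it assumes the code runs from the igo-demux repo root):

import scripts.get_total_reads_from_demux

# Hypothetical trigger params, mirroring kwargs["params"] in run_stats above.
params = {
    "project_directory": "/igo/staging/FASTQ/DIANA_0123_AHXXXXXXX/Project_12754_E",
    "recipe": "demux_stats",
}

project_directory = params["project_directory"]
recipe = params["recipe"]

if recipe == "demux_stats":
    # writes the per-sample AM total-reads txt files into the stats-done folder
    scripts.get_total_reads_from_demux.by_project_location(project_directory)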
