From 8074bea6cac878918c9ae7884792dbfb4f314a14 Mon Sep 17 00:00:00 2001
From: David McManamon <dmcmanam@gmail.com>
Date: Thu, 12 Sep 2024 09:24:20 -0400
Subject: [PATCH 1/4] Added ONT flowcell to the output summary.csv

---
 scripts/ont_stats.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/scripts/ont_stats.py b/scripts/ont_stats.py
index aeb384c..81c2be5 100644
--- a/scripts/ont_stats.py
+++ b/scripts/ont_stats.py
@@ -4,6 +4,7 @@
 import glob
 import os
 from collections import OrderedDict
+import re
 
 # TODO get barcode info from lims
 # check if the run is pooled
@@ -14,7 +15,7 @@ def if_pooled(sequencing_summary_df):
     return pooled
 
 # get stats metric if the run is not pooled
-def get_read_length_and_summary(sequencing_summary_df):
+def get_read_length_and_summary(sequencing_summary_df, flowcell):
     read_length = sequencing_summary_df[sequencing_summary_df["passes_filtering"]]["sequence_length_template"].tolist()
     if len(read_length) != 0:
         read_length.sort(reverse = True)
@@ -30,10 +31,10 @@ def get_read_length_and_summary(sequencing_summary_df):
         median = 0
         N50_value = 0
         N50 = 0
-    return(len(read_length), N50_value * 2 / 1000000000, N50, median)
+    return(len(read_length), N50_value * 2 / 1000000000, N50, median, flowcell)
 
 # get stats metric if the run is pooled
-def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name):
+def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name, flowcell):
     sample_dict = {}
     samples = sequencing_summary_df["barcode_arrangement"].unique()
     for sample in samples:
@@ -45,13 +46,21 @@ def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name):
             sample_dict[sample_sub] = get_read_length_and_summary(sample_df)
     return sample_dict
 
+def extract_flowcell(text):
+    # Regular expression to match the characters after 'sequencing_summary_' and before the next '_'
+    match = re.search(r'sequencing_summary_([^_]+)', text)
+    if match:
+        return match.group(1)
+    else:
+        return None
+
 def write_to_csv(sample_dict):
     file_name = "summary.csv"
     print("Writing stats file: " + file_name)
     with open(file_name,'w') as file:
-        file.write("sample_id, Reads, Bases, N50, Meidan Read Length\n")
+        file.write("sample_id, Reads, Bases, N50, Median Read Length, Flowcell\n")
         for key, value in sample_dict.items():
-            file.write("{}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3]))
+            file.write("{}, {}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3], value[4]))
 
 if __name__ == '__main__':
     # Usage: python ont_stats.py [project_directory]
@@ -70,16 +79,19 @@ def write_to_csv(sample_dict):
             file_count = 0
             for i in file:
                 file_count += 1
+                flowcell = extract_flowcell(i)
+                print("Processing file: " + i + " from flowcell: " + flowcell)
                 summary_matrix = pd.read_csv(i, delimiter = "\t")
                 pooled = if_pooled(summary_matrix)
                 # give different sample name for multi runs on one flow cell
                 if file_count != 1:
                     sample = sample + "_" + str(file_count)
                 if pooled:
-                    sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample)
+                    sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample, flowcell)
                     sample_dict.update(sample_dict_sub)
                 else:
-                    sample_dict[sample] = get_read_length_and_summary(summary_matrix)
+                    sample_dict[sample] = get_read_length_and_summary(summary_matrix, flowcell)
+                print(sample_dict)
 
     write_to_csv(sample_dict)
     print("ONT stats complete for: " + project_directory)

From fc2c1eef87e102750a3e90ce7261d92d2fcdd10f Mon Sep 17 00:00:00 2001
From: darrelln32 <darrelln399@gmail.com>
Date: Tue, 17 Sep 2024 09:46:27 -0400
Subject: [PATCH 2/4] Update LaunchMetrics.py

Add the DRAGEN cross-contamination VCF option (--qc-cross-cont-vcf) when aligning FASTQ data to the human genome (GRCh38) with DRAGEN; other genomes are launched without it.
---
 scripts/LaunchMetrics.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py
index 6f3c9f7..64e0e0d 100644
--- a/scripts/LaunchMetrics.py
+++ b/scripts/LaunchMetrics.py
@@ -176,11 +176,13 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas
 		# get the correct path for the reference
 		if (sample_parameters["GTAG"] == "GRCh38"):
 			dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1"
+			vcfFileOption = "--qc-cross-cont-vcf /opt/edico/config/sample_cross_contamination_resource_hg38.vcf.gz"
 		else:
 			dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"])
+			vcfFileOption = ""
 			
 		metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
-		launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
+		launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000 {}".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id, vcfFileOption)
 		bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
 		print(bsub_launch_dragen)
 		call(bsub_launch_dragen, shell = True)

From 22297a74e93a147963a84211fc9115697a8bf4af Mon Sep 17 00:00:00 2001
From: luc <cl3262@nyu.edu>
Date: Thu, 19 Sep 2024 11:05:43 -0400
Subject: [PATCH 3/4] add chip position to stats

---
 scripts/ont_stats.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/scripts/ont_stats.py b/scripts/ont_stats.py
index 81c2be5..61ccec8 100644
--- a/scripts/ont_stats.py
+++ b/scripts/ont_stats.py
@@ -15,7 +15,7 @@ def if_pooled(sequencing_summary_df):
     return pooled
 
 # get stats metric if the run is not pooled
-def get_read_length_and_summary(sequencing_summary_df, flowcell):
+def get_read_length_and_summary(sequencing_summary_df, flowcell, position):
     read_length = sequencing_summary_df[sequencing_summary_df["passes_filtering"]]["sequence_length_template"].tolist()
     if len(read_length) != 0:
         read_length.sort(reverse = True)
@@ -31,19 +31,19 @@ def get_read_length_and_summary(sequencing_summary_df, flowcell):
         median = 0
         N50_value = 0
         N50 = 0
-    return(len(read_length), N50_value * 2 / 1000000000, N50, median, flowcell)
+    return(len(read_length), N50_value * 2 / 1000000000, N50, median, flowcell, position)
 
 # get stats metric if the run is pooled
-def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name, flowcell):
+def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name, flowcell, position):
     sample_dict = {}
     samples = sequencing_summary_df["barcode_arrangement"].unique()
     for sample in samples:
         sample_df = sequencing_summary_df.loc[sequencing_summary_df['barcode_arrangement'] == sample]
         sample_sub = sample_name + "_" + sample
-        stats = get_read_length_and_summary(sample_df)
+        stats = get_read_length_and_summary(sample_df, flowcell, position)
         # only record barcodes with more than 10000 reads
         if stats[0] > 10000:
-            sample_dict[sample_sub] = get_read_length_and_summary(sample_df)
+            sample_dict[sample_sub] = get_read_length_and_summary(sample_df, flowcell, position)
     return sample_dict
 
 def extract_flowcell(text):
@@ -58,9 +58,9 @@ def write_to_csv(sample_dict):
     file_name = "summary.csv"
     print("Writing stats file: " + file_name)
     with open(file_name,'w') as file:
-        file.write("sample_id, Reads, Bases, N50, Median Read Length, Flowcell\n")
+        file.write("sample_id, Reads, Bases, N50, Median Read Length, Flowcell, Position\n")
         for key, value in sample_dict.items():
-            file.write("{}, {}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3], value[4]))
+            file.write("{}, {}, {}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3], value[4], value[5]))
 
 if __name__ == '__main__':
     # Usage: python ont_stats.py [project_directory]
@@ -79,18 +79,19 @@ def write_to_csv(sample_dict):
             file_count = 0
             for i in file:
                 file_count += 1
+                position = i.split("/")[-2].split("_")[2]
                 flowcell = extract_flowcell(i)
-                print("Processing file: " + i + " from flowcell: " + flowcell)
+                print("Processing file: " + i + " from flowcell: " + flowcell + " at position:" + position)
                 summary_matrix = pd.read_csv(i, delimiter = "\t")
                 pooled = if_pooled(summary_matrix)
                 # give different sample name for multi runs on one flow cell
                 if file_count != 1:
                     sample = sample + "_" + str(file_count)
                 if pooled:
-                    sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample, flowcell)
+                    sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample, flowcell, position)
                     sample_dict.update(sample_dict_sub)
                 else:
-                    sample_dict[sample] = get_read_length_and_summary(summary_matrix, flowcell)
+                    sample_dict[sample] = get_read_length_and_summary(summary_matrix, flowcell, position)
                 print(sample_dict)
 
     write_to_csv(sample_dict)

From b4ab63c55385e28988f85c795b6feccfd724b29e Mon Sep 17 00:00:00 2001
From: darrelln32 <darrelln399@gmail.com>
Date: Tue, 24 Sep 2024 13:05:29 -0400
Subject: [PATCH 4/4] Update run_param_config.py

added new genome Nakaseomyces glabratus to run_param_config
---
 scripts/run_param_config.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py
index 80ec285..075f564 100644
--- a/scripts/run_param_config.py
+++ b/scripts/run_param_config.py
@@ -131,6 +131,7 @@ def get_ordered_dic(unordered_dic):
 		"E.Lambda": "elambda",
 		"Plasmid": "ecolik12",
 		"Pig": "sscrofa11",
+		"N. glabratus": "cbs138",
 		# FOR NEW ENTRIES
 		# "{regex}": "{GENOME}"
 	
@@ -324,6 +325,20 @@ def get_ordered_dic(unordered_dic):
 						GTAG: "grcz11"
 				}
 		},
+		"cbs138": {
+				DEFAULT: {
+						GENOME: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.fa",
+						REFERENCE: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.fa"
+				},
+				"RNA": {
+						GENOME: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.fa",
+						REFERENCE: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.fa",
+						REF_FLAT: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.CLEAN.gtf.refflat",
+						RIBOSOMAL_INTERVALS: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.CLEAN.gtf.rRNA.intervals",
+						GTF: "/igo/work/nabors/genomes/N.glabratus/GCF/GCF_010111755_1_ASM1011175v1_genomic.CLEAN.gtf",
+						GTAG: "cbs138"
+				}
+		},
 		"ce10": {
 				DEFAULT: {
 						GENOME: "/igo/work/genomes/C.elegans/ce10/BWA_0.7.5a/ce10.fa",