Skip to content

Commit

Permalink
Merge pull request #27 from ryanjameskennedy/26-add-sequencing_run-an…
Browse files Browse the repository at this point in the history
…d-clarity_sample_id

Add sequencing_run and clarity_sample_id to missing subarg output
  • Loading branch information
ryanjameskennedy authored Mar 16, 2024
2 parents 1d7d644 + e8e2b31 commit c9b731e
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 17 deletions.
44 changes: 33 additions & 11 deletions jasentool/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ def edit_read_paths(reads, restore_dir):
read1, read2 = [filename.rstrip(".spring") + f"_R{i}_001.fastq.gz" for i in [1, 2]]
return os.path.join(restore_dir, reads.split("BaseCalls/")[1]), [read1, read2]

@staticmethod
def get_seqrun_from_filepath(filepath):
    """Return the first component of filepath that starts with six digits.

    The path is split on "/" and its components are tested in order. The
    intent is to pick out the sequencing run directory, whose name is
    expected to begin with a YYMMDD date; note that only the presence of
    six leading digits is checked, not that they form a valid date.

    Returns the whole matching component (not just the digits), or None
    when no component matches.
    """
    # Anchored at the start of the component: six leading digits.
    seqrun_pattern = re.compile(r'^\d{6}')
    for component in filepath.split("/"):
        if seqrun_pattern.search(component):
            return component
    return None

@staticmethod
def check_file_cp(reads, restore_dir):
"""Check that file not already coppied to restore directory"""
Expand Down Expand Up @@ -70,20 +80,22 @@ def check_file_cp(reads, restore_dir):
def parse_sample_sheet(sample_sheet, restore_dir):
"""Parse sample sheets for sample meta data"""
csv_dict = {}
seqrun = Missing.get_seqrun_from_filepath(sample_sheet)
with open(sample_sheet, "r", encoding="utf-8") as fin:
for line in fin:
if line.endswith("saureus\n"):
line = line.rstrip()
sample_id = line.split(",")[-1].split("_")[1]
species = line.split(",")[-1].split("_")[2]
try:
clarity_id = line.split(",")[0].split(":")[1]
clarity_sample_meta = line.split(",")[0].split(":")[1]
except IndexError:
clarity_id = line.split(",")[0]
clarity_sample_meta = line.split(",")[0]
try:
clarity_group_id = clarity_id.split("_")[1]
clarity_group_id = clarity_sample_meta.split("_")[1]
except IndexError:
clarity_group_id = clarity_id
clarity_group_id = clarity_sample_meta
clarity_sample_id = clarity_sample_meta.split("_")[0]
if ":" in line:
parent_dir = os.path.join(
line.split(":")[0].rstrip("SampleSheet.csv"),
Expand All @@ -95,12 +107,14 @@ def parse_sample_sheet(sample_sheet, restore_dir):
"Data/Intensities/BaseCalls/"
)
try:
paired_reads = Missing.find_files(r'^' + clarity_id, parent_dir)
paired_reads = Missing.find_files(r'^' + clarity_sample_id, parent_dir)
if len(paired_reads) == 2 and paired_reads[0].endswith(".gz"):
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand All @@ -113,8 +127,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
[restore_dir]*len(spring_fpaths)
))[0]
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
paired_reads,
spring_fpaths,
restored_spring_fpaths
Expand All @@ -124,8 +140,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
if len(paired_reads) == 2:
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand All @@ -140,8 +158,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
if paired_read.endswith(".fastq.gz")]
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand All @@ -151,8 +171,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
if paired_read.endswith(".fastq.gz")]
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand Down Expand Up @@ -253,14 +275,14 @@ def create_bash_script(csv_dict, restore_dir):
jcp_command = ""
unspring_command = ""
try:
spring_fpaths, restored_fpaths = csv_dict[sample][3][0], csv_dict[sample][4]
read1, _ = csv_dict[sample][2]
spring_fpaths, restored_fpaths = csv_dict[sample][5][0], csv_dict[sample][6]
read1, _ = csv_dict[sample][4]
if not os.path.exists(restored_fpaths) and not os.path.exists(read1):
jcp_command = f'/fs2/sw/bnf-scripts/jcp {spring_fpaths} {restore_dir}/ && '
unspring_command = f'/fs2/sw/bnf-scripts/unspring_file.pl {restored_fpaths} {restore_dir}/ WAIT &\nPIDS="$PIDS $!"\n'
spring_command = spring_command + jcp_command + unspring_command
except TypeError:
for read_fpath in csv_dict[sample][4]:
for read_fpath in csv_dict[sample][6]:
jcp_command = f'/fs2/sw/bnf-scripts/jcp {read_fpath} {restore_dir}/ WAIT &\nPIDS="$PIDS $!"\n'
spring_command = spring_command + jcp_command
bash_script = shell_script_path + shell_fail_count + spring_command + shell_for_loop
Expand All @@ -272,12 +294,12 @@ def remove_empty_files(csv_dict):
empty_files_dict = {}
for sample in csv_dict:
try:
file_size_r1 = os.path.getsize(csv_dict[sample][2][0]) / (1024 * 1024)
file_size_r2 = os.path.getsize(csv_dict[sample][2][1]) / (1024 * 1024)
file_size_r1 = os.path.getsize(csv_dict[sample][4][0]) / (1024 * 1024)
file_size_r2 = os.path.getsize(csv_dict[sample][4][1]) / (1024 * 1024)
if file_size_r1 < 10 or file_size_r2 < 10:
empty_files_dict[sample] = csv_dict[sample]
except FileNotFoundError:
print(f"WARN: {sample} read files ({csv_dict[sample][2][0]} and/or {csv_dict[sample][2][1]}) could not be found!")
print(f"WARN: {sample} read files ({csv_dict[sample][4][0]} and/or {csv_dict[sample][4][1]}) could not be found!")
except IndexError:
print(csv_dict[sample])
for empty_file in list(empty_files_dict.keys()):
Expand Down
13 changes: 8 additions & 5 deletions jasentool/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@ class Utils:
def write_out_csv(csv_dict, assay, platform, out_fpath):
    """Write sample metadata out as a CSV file.

    csv_dict maps a sample id to a positionally indexed sequence, read here
    as: 0 = clarity sample id, 1 = group, 2 = species, 3 = sequencing run,
    4 = pair of read paths (read1, read2). Any further elements are ignored.
    assay and platform are written unchanged into every row. out_fpath is
    created or overwritten.
    """
    fieldnames = ["id", "clarity_sample_id", "group", "species", "assay",
                  "platform", "sequencing_run", "read1", "read2"]  # header
    # newline="" lets the csv module control line endings itself; without it
    # rows end in "\r\r\n" on platforms that translate "\n" on write.
    with open(out_fpath, 'w', encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for sample, meta in csv_dict.items():
            read1, read2 = meta[4][0], meta[4][1]
            writer.writerow({
                "id": sample,
                "clarity_sample_id": meta[0],
                "group": meta[1],
                "species": meta[2],
                "assay": assay,
                "platform": platform,
                "sequencing_run": meta[3],
                "read1": read1,
                "read2": read2,
            })

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies = [
"requests",
"tqdm",
"pandas",
"pymongo",
"pymongo==3.13",
"openpyxl",
"biopython"
]
Expand Down

0 comments on commit c9b731e

Please sign in to comment.