Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sequencing_run and clarity_sample_id to missing subarg output #27

Merged
merged 3 commits into from
Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 33 additions & 11 deletions jasentool/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ def edit_read_paths(reads, restore_dir):
read1, read2 = [filename.rstrip(".spring") + f"_R{i}_001.fastq.gz" for i in [1, 2]]
return os.path.join(restore_dir, reads.split("BaseCalls/")[1]), [read1, read2]

@staticmethod
def get_seqrun_from_filepath(filepath):
dirs = filepath.split("/")
pattern = r'^\d{6}' # Regular expression pattern for YYMMDD format
for dir in dirs:
match = re.search(pattern, dir)
if match:
return dir
return None

@staticmethod
def check_file_cp(reads, restore_dir):
"""Check that file not already coppied to restore directory"""
Expand Down Expand Up @@ -70,20 +80,22 @@ def check_file_cp(reads, restore_dir):
def parse_sample_sheet(sample_sheet, restore_dir):
"""Parse sample sheets for sample meta data"""
csv_dict = {}
seqrun = Missing.get_seqrun_from_filepath(sample_sheet)
with open(sample_sheet, "r", encoding="utf-8") as fin:
for line in fin:
if line.endswith("saureus\n"):
line = line.rstrip()
sample_id = line.split(",")[-1].split("_")[1]
species = line.split(",")[-1].split("_")[2]
try:
clarity_id = line.split(",")[0].split(":")[1]
clarity_sample_meta = line.split(",")[0].split(":")[1]
except IndexError:
clarity_id = line.split(",")[0]
clarity_sample_meta = line.split(",")[0]
try:
clarity_group_id = clarity_id.split("_")[1]
clarity_group_id = clarity_sample_meta.split("_")[1]
except IndexError:
clarity_group_id = clarity_id
clarity_group_id = clarity_sample_meta
clarity_sample_id = clarity_sample_meta.split("_")[0]
if ":" in line:
parent_dir = os.path.join(
line.split(":")[0].rstrip("SampleSheet.csv"),
Expand All @@ -95,12 +107,14 @@ def parse_sample_sheet(sample_sheet, restore_dir):
"Data/Intensities/BaseCalls/"
)
try:
paired_reads = Missing.find_files(r'^' + clarity_id, parent_dir)
paired_reads = Missing.find_files(r'^' + clarity_sample_id, parent_dir)
if len(paired_reads) == 2 and paired_reads[0].endswith(".gz"):
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand All @@ -113,8 +127,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
[restore_dir]*len(spring_fpaths)
))[0]
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
paired_reads,
spring_fpaths,
restored_spring_fpaths
Expand All @@ -124,8 +140,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
if len(paired_reads) == 2:
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand All @@ -140,8 +158,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
if paired_read.endswith(".fastq.gz")]
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand All @@ -151,8 +171,10 @@ def parse_sample_sheet(sample_sheet, restore_dir):
if paired_read.endswith(".fastq.gz")]
restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
csv_dict[sample_id] = [
clarity_sample_id,
clarity_group_id,
species,
seqrun,
restored_reads_fpaths,
None,
paired_reads
Expand Down Expand Up @@ -253,14 +275,14 @@ def create_bash_script(csv_dict, restore_dir):
jcp_command = ""
unspring_command = ""
try:
spring_fpaths, restored_fpaths = csv_dict[sample][3][0], csv_dict[sample][4]
read1, _ = csv_dict[sample][2]
spring_fpaths, restored_fpaths = csv_dict[sample][5][0], csv_dict[sample][6]
read1, _ = csv_dict[sample][4]
if not os.path.exists(restored_fpaths) and not os.path.exists(read1):
jcp_command = f'/fs2/sw/bnf-scripts/jcp {spring_fpaths} {restore_dir}/ && '
unspring_command = f'/fs2/sw/bnf-scripts/unspring_file.pl {restored_fpaths} {restore_dir}/ WAIT &\nPIDS="$PIDS $!"\n'
spring_command = spring_command + jcp_command + unspring_command
except TypeError:
for read_fpath in csv_dict[sample][4]:
for read_fpath in csv_dict[sample][6]:
jcp_command = f'/fs2/sw/bnf-scripts/jcp {read_fpath} {restore_dir}/ WAIT &\nPIDS="$PIDS $!"\n'
spring_command = spring_command + jcp_command
bash_script = shell_script_path + shell_fail_count + spring_command + shell_for_loop
Expand All @@ -272,12 +294,12 @@ def remove_empty_files(csv_dict):
empty_files_dict = {}
for sample in csv_dict:
try:
file_size_r1 = os.path.getsize(csv_dict[sample][2][0]) / (1024 * 1024)
file_size_r2 = os.path.getsize(csv_dict[sample][2][1]) / (1024 * 1024)
file_size_r1 = os.path.getsize(csv_dict[sample][4][0]) / (1024 * 1024)
file_size_r2 = os.path.getsize(csv_dict[sample][4][1]) / (1024 * 1024)
if file_size_r1 < 10 or file_size_r2 < 10:
empty_files_dict[sample] = csv_dict[sample]
except FileNotFoundError:
print(f"WARN: {sample} read files ({csv_dict[sample][2][0]} and/or {csv_dict[sample][2][1]}) could not be found!")
print(f"WARN: {sample} read files ({csv_dict[sample][4][0]} and/or {csv_dict[sample][4][1]}) could not be found!")
except IndexError:
print(csv_dict[sample])
for empty_file in list(empty_files_dict.keys()):
Expand Down
13 changes: 8 additions & 5 deletions jasentool/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@ class Utils:
def write_out_csv(csv_dict, assay, platform, out_fpath):
    """Write sample metadata out as a CSV file.

    Args:
        csv_dict: mapping of sample_id -> metadata list laid out as
            [clarity_sample_id, group, species, sequencing_run,
            [read1, read2], ...] (as built by Missing.parse_sample_sheet
            — TODO confirm layout against that producer).
        assay: assay name written to every row.
        platform: sequencing platform written to every row.
        out_fpath: destination path for the CSV file.
    """
    # newline="" is required by the csv module so it controls line
    # endings itself (prevents blank lines / \r\r\n on Windows).
    with open(out_fpath, 'w', encoding="utf-8", newline="") as csvfile:
        fieldnames = ["id", "clarity_sample_id", "group", "species", "assay",
                      "platform", "sequencing_run", "read1", "read2"]  # header
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for sample, meta in csv_dict.items():
            row_dict = {"id": sample, "clarity_sample_id": meta[0],
                        "group": meta[1], "species": meta[2],
                        "assay": assay, "platform": platform,
                        "sequencing_run": meta[3],
                        "read1": meta[4][0],
                        "read2": meta[4][1]}  # write rows to CSV
            writer.writerow(row_dict)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies = [
"requests",
"tqdm",
"pandas",
"pymongo",
"pymongo==3.13",
"openpyxl",
"biopython"
]
Expand Down
Loading