Commit

publishdir update
weber8thomas committed Jan 17, 2024
1 parent fb07c6e commit 8aaea88
Showing 6 changed files with 62 additions and 98 deletions.
2 changes: 2 additions & 0 deletions workflow/envs/mc_base.yaml
@@ -20,3 +20,5 @@ dependencies:
   # ArbiGent Hufsah deps
   - pytables
   - xopen
+  # Genome browsing
+  - pybigwig
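`pybigwig` is added for the new genome-browsing outputs. For context, a minimal sketch of what the library provides (the file name and region are illustrative, not from this commit):

```python
# Minimal pyBigWig usage sketch; "coverage.bw" and the region are hypothetical.
import pyBigWig

bw = pyBigWig.open("coverage.bw")    # open a bigWig signal track for reading
print(bw.chroms())                   # {'chr1': 248956422, ...}
print(bw.stats("chr1", 0, 100_000))  # mean signal over the region, e.g. [0.42]
bw.close()
```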
87 changes: 15 additions & 72 deletions workflow/rules/common.smk
@@ -778,10 +778,10 @@ def get_all_plots(wildcards):

     l_outputs.extend(
         expand(
-            "{folder}/{sample}/config/{conda}.yaml",
+            "{folder}/{sample}/config/{conda_env}.yaml",
             folder=config["data_location"],
             sample=wildcards.sample,
-            conda=conda_envs,
+            conda_env=conda_envs,
         ),
     )

@@ -929,10 +929,10 @@ def get_all_plots(wildcards):

     l_outputs.extend(
         expand(
-            "{folder}/{sample}/config/conda_export/{conda}.yaml",
+            "{folder}/{sample}/config/conda_export/{conda_env}.yaml",
             folder=config["data_location"],
             sample=wildcards.sample,
-            conda=conda_envs,
+            conda_env=conda_envs,
         ),
     )
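Both hunks above make the same rename, `conda` → `conda_env`, presumably to avoid shadowing or confusion with Snakemake's `conda:` directive keyword. Functionally nothing changes: `expand()` substitutes each keyword argument into the placeholder of the same name. A minimal illustration with made-up values:

```python
# expand() fills {placeholders} from keyword arguments; values here are made up.
from snakemake.io import expand

paths = expand(
    "{folder}/{sample}/config/{conda_env}.yaml",
    folder="/data",
    sample="SAMPLE1",
    conda_env=["mc_base", "rtools"],
)
print(paths)
# ['/data/SAMPLE1/config/mc_base.yaml', '/data/SAMPLE1/config/rtools.yaml']
```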

@@ -959,82 +959,25 @@ def get_all_plots(wildcards):
     return l_outputs


 def publishdir_fct_mc(wildcards):
     """
-    Restricted for ASHLEYS at the moment
-    Backup files on a secondary location
+    Function to generate a list of files and directories for backup.
     """
-    list_files_to_copy = [
-        # ASHLEYS
-        "{folder}/{sample}/cell_selection/labels_raw.tsv",
-        "{folder}/{sample}/cell_selection/labels.tsv",
-        "{folder}/{sample}/counts/{sample}.info_raw",
-        "{folder}/{sample}/counts/{sample}.txt.raw.gz",
-        # "{folder}/{sample}/config/config.yaml",
-        # MC
-        "{folder}/{sample}/config/config.yaml",
-    ]
-
-    list_files_to_copy += [
+    list_files_to_copy = [
         e for e in get_all_plots(wildcards) if "publishdir_outputs_mc.ok" not in e
     ]

-    final_list = [
-        expand(e, folder=config["data_location"], sample=wildcards.sample)
-        for e in list_files_to_copy
+    # Expand the paths for files
+    expanded_files = [
+        expand(file_path, folder=config["data_location"], sample=wildcards.sample)
+        for file_path in list_files_to_copy
     ]
-    final_list = [sub_e for e in final_list for sub_e in e]
-    final_list.extend(
-        expand(
-            "{folder}/{sample}/plots/counts/CountComplete.{plottype_counts}.pdf",
-            folder=config["data_location"],
-            sample=wildcards.sample,
-            plottype_counts=plottype_counts,
-        )
-    )
-
-    if config["use_light_data"] is False:
-        final_list.extend(
-            expand(
-                "{folder}/{sample}/plots/plate/ashleys_plate_{plate_plot}.pdf",
-                folder=config["data_location"],
-                sample=wildcards.sample,
-                plate_plot=["predictions", "probabilities"],
-            )
-        )
-        final_list.extend(
-            expand(
-                "{folder}/{sample}/cell_selection/labels_positive_control_corrected.tsv",
-                folder=config["data_location"],
-                sample=wildcards.sample,
-            )
-        )
-        final_list.extend(
-            expand(
-                "{folder}/{sample}/config/bypass_cell.txt",
-                folder=config["data_location"],
-                sample=wildcards.sample,
-            )
-        )
-
-    # folders_to_keep = [
-    #     "plots",
-    #     "snv_calls",
-    #     "segmentation",
-    #     "haplotag",
-    #     "strandphaser",
-    #     "ploidy",
-    #     "stats",
-    #     "mosaiclassifier",
-    # ]
-
-    # final_list += expand(
-    #     "{folder}/{sample}/{folder_to_keep}/",
-    #     folder=config["data_location"],
-    #     sample=wildcards.sample,
-    #     folder_to_keep=folders_to_keep,
-    # )
-
+    final_list = [sub_e for e in expanded_files for sub_e in e]
     # print(final_list)


     return final_list
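The refactor replaces the hand-maintained backup list with whatever `get_all_plots` reports (minus the sentinel file), then expands and flattens. Since each `expand()` call returns a list, the comprehension produces a list of lists that the last line flattens. A standalone sketch with hypothetical paths:

```python
# Expand-then-flatten, as in publishdir_fct_mc above; paths are hypothetical.
from snakemake.io import expand

list_files_to_copy = ["{folder}/{sample}/a.pdf", "{folder}/{sample}/b.pdf"]

# Each expand() returns a list, so this builds a list of lists...
expanded_files = [
    expand(file_path, folder="/data", sample="S1")
    for file_path in list_files_to_copy
]

# ...which a nested comprehension flattens into one list of concrete paths.
final_list = [sub_e for e in expanded_files for sub_e in e]
print(final_list)  # ['/data/S1/a.pdf', '/data/S1/b.pdf']
```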
30 changes: 20 additions & 10 deletions workflow/rules/external_data.smk
@@ -1,12 +1,15 @@
 import os
-from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
+# from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider

-HTTP = HTTPRemoteProvider()
+storage:
+    provider="http",
+# HTTP = HTTPRemoteProvider()


 rule dl_example_data:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://sandbox.zenodo.org/record/1074721/files/TEST_EXAMPLE_DATA.zip",
             keep_local=True,
         ),
@@ -20,7 +23,8 @@ rule dl_example_data:

 rule download_hg19_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/analysisSet/hg19.p13.plusMT.no_alt_analysis_set.fa.gz",
             keep_local=True,
         ),
@@ -41,7 +45,8 @@ rule download_hg19_reference:

 rule download_hg38_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz",
             keep_local=True,
         ),
@@ -62,7 +67,8 @@ rule download_hg38_reference:

 rule download_T2T_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz",
             keep_local=True,
         ),
@@ -83,7 +89,8 @@ rule download_T2T_reference:

 rule download_mm10_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz",
             keep_local=True,
         ),
@@ -104,7 +111,8 @@ rule download_mm10_reference:

 rule download_T2T_tarball:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz",
             keep_local=True,
         ),
@@ -124,7 +132,8 @@ rule download_T2T_tarball:

 rule download_arbigent_mappability_track:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://zenodo.org/record/7697400/files/mapping_counts_allchrs_hg38.txt",
             keep_local=True,
         ),
@@ -143,7 +152,8 @@ rule download_arbigent_mappability_track:
 rule download_scnova_data:
     input:
         ancient(
-            HTTP.remote(
+            # HTTP.remote(
+            storage.http(
                 "https://zenodo.org/record/7697400/files/scNOVA_data_models.zip",
                 keep_local=True,
             )
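Every hunk in this file applies the same migration: Snakemake 8 removed the `snakemake.remote` providers, so `HTTP.remote(...)` calls are commented out and replaced by the storage-plugin interface declared at the top of the file. A condensed sketch of the new pattern, assuming the `snakemake-storage-plugin-http` package is installed (the rule body is illustrative; only the `storage`/`input` lines mirror the actual change):

```python
# Snakefile sketch of the storage-plugin pattern used above.
storage:
    provider="http",


rule download_example:
    input:
        # storage.http(...) replaces HTTP.remote(...); keep_local=True keeps
        # the downloaded copy on disk instead of treating it as ephemeral.
        storage.http(
            "https://example.org/reference.fa.gz",
            keep_local=True,
        ),
    output:
        "reference/reference.fa.gz",
    shell:
        "cp {input} {output}"
```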
4 changes: 2 additions & 2 deletions workflow/rules/plots.smk
@@ -38,14 +38,14 @@ rule divide_pdf:
         "{folder}/{sample}/plots/counts/CountComplete.raw.pdf",
     output:
         report(
-            "{folder}/{sample}/plots/counts_raw/{cell}.{i, \d+}.pdf",
+            "{folder}/{sample}/plots/counts_raw/{cell}.{i, \\d+}.pdf",
             caption="../report/mosaic_counts.rst",
             category="Mosaic counts cellwise",
             subcategory="{sample}",
             labels={"Cell": "{cell}", "Nb": "{i}", "Type": "raw"},
         ),
     log:
-        "{folder}/log/{sample}/plots/counts_raw/{cell}.{i, \d+}.log",
+        "{folder}/log/{sample}/plots/counts_raw/{cell}.{i, \\d+}.log",
     conda:
         "../envs/mc_base.yaml"
     resources:
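The only change here is `\d` → `\\d` inside the wildcard constraint. In a normal Python string, `\d` is an invalid escape sequence (a `DeprecationWarning` since Python 3.6 and a `SyntaxWarning` in 3.12), so the backslash must be doubled to reach the regex engine intact. A quick illustration:

```python
import re

# "\\d" in a normal string literal is the two characters backslash + d,
# exactly what a raw string r"\d" denotes, and what the regex engine expects.
assert "\\d" == r"\d"

# The wildcard constraint {i, \d+} ultimately behaves like this regex:
assert re.fullmatch(r"\d+", "42")
assert not re.fullmatch(r"\d+", "4a")
```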
2 changes: 1 addition & 1 deletion workflow/rules/utils.smk
@@ -10,7 +10,7 @@ rule check_sm_tag:
     shell:
         """
         sample_name="{wildcards.sample}"
-        sm_tag=$(samtools view -H {input} | grep '^@RG' | sed "s/.*SM:\([^\\t]*\).*/\\1/g")
+        sm_tag=$(samtools view -H {input} | grep '^@RG' | sed "s/.*SM:\\([^\\t]*\\).*/\\1/g")
         if [[ $sample_name == $sm_tag ]]; then
             echo "{input}: $sm_tag $sample_name OK" > {output}
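Same root cause as in plots.smk: the `shell:` block is processed as a Python string before the command reaches bash, so backslashes intended for `sed` must be doubled. A sketch of what `sed` actually receives after Python unescaping (illustrative, outside Snakemake):

```python
# After Python unescapes the shell: string, sed sees single backslashes:
shell_string = "sed \"s/.*SM:\\([^\\t]*\\).*/\\1/g\""
print(shell_string)
# sed "s/.*SM:\([^\t]*\).*/\1/g"  <- \( \) capture the SM tag value, \1 recalls it
```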
35 changes: 22 additions & 13 deletions workflow/scripts/utils/publishdir.py
@@ -2,6 +2,7 @@
 import os


+# Function to run shell commands and print the output
 def run_command(command):
     process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
     while True:
@@ -14,33 +15,41 @@ def run_command(command):
     return rc


+# Configuration and wildcards from Snakemake
 data_location = snakemake.config["data_location"]
 publishdir = snakemake.config["publishdir"]
 run = snakemake.wildcards.folder.split("/")[-1]
 print(snakemake.wildcards.folder)
 print(run)
 sample = snakemake.wildcards.sample

+# Directories to copy entirely
+directories_to_copy = [
+    f"{data_location}/{sample}/plots/",
+    f"{data_location}/{sample}/haplotag/bam/",
+    f"{data_location}/{sample}/mosaiclassifier/",
+    f"{data_location}/{sample}/counts/",
+    f"{data_location}/{sample}/cell_selection/",
+    f"{data_location}/{sample}/config/",
+    f"{data_location}/{sample}/segmentation/",
+    f"{data_location}/{sample}/snv_calls/",
+    f"{data_location}/{sample}/stats/",
+    # Add other directories as needed
+]
+
+# Create base directory to maintain the structure
 os.makedirs(f"{publishdir}/{run}", exist_ok=True)
 os.makedirs(f"{publishdir}/{run}/{sample}", exist_ok=True)

-for item in list(snakemake.input.list_publishdir):
+for item in directories_to_copy:
     print(item)
+    # Replace the base path with the destination path
     destination_path = item.replace(data_location, f"{publishdir}/{run}")

     if os.path.isdir(item):
+        # Ensure the destination directory exists
         os.makedirs(destination_path, exist_ok=True)
+        # Copy the entire directory recursively
         rsync_command = (
             f"rsync --ignore-existing -avzh --progress {item}/ {destination_path}/"
         )
     else:
-        # Copy the file (including parent directory structure)
-        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
-        rsync_command = (
-            f"rsync --ignore-existing -avzh --progress {item} {destination_path}"
-        )
+        # If it's a file or the directory doesn't exist, skip
+        continue

     print(rsync_command)
     run_command(rsync_command)
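To make the rewritten copy loop concrete, here is the path mapping it performs, using hypothetical config values (the real ones come from `snakemake.config` and the rule's wildcards):

```python
# Hypothetical values illustrating the destination mapping in the loop above.
data_location = "/scratch/run_2024_01"  # snakemake.config["data_location"]
publishdir = "/archive"                 # snakemake.config["publishdir"]
run = "run_2024_01"                     # last component of wildcards.folder
sample = "SAMPLE1"

item = f"{data_location}/{sample}/plots"  # one of directories_to_copy
destination_path = item.replace(data_location, f"{publishdir}/{run}")
print(destination_path)  # /archive/run_2024_01/SAMPLE1/plots

# rsync copies recursively (-a) but --ignore-existing never overwrites files
# already present at the destination:
print(f"rsync --ignore-existing -avzh --progress {item}/ {destination_path}/")
```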
