diff --git a/workflow/envs/mc_base.yaml b/workflow/envs/mc_base.yaml index 7c4dd81c..97c0ff0b 100644 --- a/workflow/envs/mc_base.yaml +++ b/workflow/envs/mc_base.yaml @@ -20,3 +20,5 @@ dependencies: # ArbiGent Hufsah deps - pytables - xopen + # Genome browsing + - pybigwig diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d6bc5c6b..5853d5d4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -778,10 +778,10 @@ def get_all_plots(wildcards): l_outputs.extend( expand( - "{folder}/{sample}/config/{conda}.yaml", + "{folder}/{sample}/config/{conda_env}.yaml", folder=config["data_location"], sample=wildcards.sample, - conda=conda_envs, + conda_env=conda_envs, ), ) @@ -929,10 +929,10 @@ def get_all_plots(wildcards): l_outputs.extend( expand( - "{folder}/{sample}/config/conda_export/{conda}.yaml", + "{folder}/{sample}/config/conda_export/{conda_env}.yaml", folder=config["data_location"], sample=wildcards.sample, - conda=conda_envs, + conda_env=conda_envs, ), ) @@ -959,82 +959,25 @@ def get_all_plots(wildcards): return l_outputs + + def publishdir_fct_mc(wildcards): """ - Restricted for ASHLEYS at the moment - Backup files on a secondary location + Function to generate a list of files and directories for backup. """ - list_files_to_copy = [ - # ASHLEYS - "{folder}/{sample}/cell_selection/labels_raw.tsv", - "{folder}/{sample}/cell_selection/labels.tsv", - "{folder}/{sample}/counts/{sample}.info_raw", - "{folder}/{sample}/counts/{sample}.txt.raw.gz", - # "{folder}/{sample}/config/config.yaml", - # MC - "{folder}/{sample}/config/config.yaml", - ] - list_files_to_copy += [ + + list_files_to_copy = [ e for e in get_all_plots(wildcards) if "publishdir_outputs_mc.ok" not in e ] - final_list = [ - expand(e, folder=config["data_location"], sample=wildcards.sample) - for e in list_files_to_copy + # Expand the paths for files + expanded_files = [ + expand(file_path, folder=config["data_location"], sample=wildcards.sample) + for file_path in list_files_to_copy ] - final_list = [sub_e for e in final_list for sub_e in e] - final_list.extend( - expand( - "{folder}/{sample}/plots/counts/CountComplete.{plottype_counts}.pdf", - folder=config["data_location"], - sample=wildcards.sample, - plottype_counts=plottype_counts, - ) - ) - - if config["use_light_data"] is False: - final_list.extend( - expand( - "{folder}/{sample}/plots/plate/ashleys_plate_{plate_plot}.pdf", - folder=config["data_location"], - sample=wildcards.sample, - plate_plot=["predictions", "probabilities"], - ) - ) - final_list.extend( - expand( - "{folder}/{sample}/cell_selection/labels_positive_control_corrected.tsv", - folder=config["data_location"], - sample=wildcards.sample, - ) - ) - final_list.extend( - expand( - "{folder}/{sample}/config/bypass_cell.txt", - folder=config["data_location"], - sample=wildcards.sample, - ) - ) - - # folders_to_keep = [ - # "plots", - # "snv_calls", - # "segmentation", - # "haplotag", - # "strandphaser", - # "ploidy", - # "stats", - # "mosaiclassifier", - # ] - - # final_list += expand( - # "{folder}/{sample}/{folder_to_keep}/", - # folder=config["data_location"], - # sample=wildcards.sample, - # folder_to_keep=folders_to_keep, - # ) - + final_list = [sub_e for e in expanded_files for sub_e in e] # print(final_list) + return final_list diff --git a/workflow/rules/external_data.smk b/workflow/rules/external_data.smk index 634ac216..9b417e95 100644 --- a/workflow/rules/external_data.smk +++ b/workflow/rules/external_data.smk @@ -1,12 +1,15 @@ import os -from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider +# from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider -HTTP = HTTPRemoteProvider() +storage: + provider="http", + # HTTP = HTTPRemoteProvider() rule dl_example_data: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://sandbox.zenodo.org/record/1074721/files/TEST_EXAMPLE_DATA.zip", keep_local=True, ), @@ -20,7 +23,8 @@ rule dl_example_data: rule download_hg19_reference: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/analysisSet/hg19.p13.plusMT.no_alt_analysis_set.fa.gz", keep_local=True, ), @@ -41,7 +45,8 @@ rule download_hg19_reference: rule download_hg38_reference: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz", keep_local=True, ), @@ -62,7 +67,8 @@ rule download_hg38_reference: rule download_T2T_reference: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz", keep_local=True, ), @@ -83,7 +89,8 @@ rule download_T2T_reference: rule download_mm10_reference: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz", keep_local=True, ), @@ -104,7 +111,8 @@ rule download_mm10_reference: rule download_T2T_tarball: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz", keep_local=True, ), @@ -124,7 +132,8 @@ rule download_T2T_tarball: rule download_arbigent_mappability_track: input: - HTTP.remote( + # HTTP.remote( + storage.http( "https://zenodo.org/record/7697400/files/mapping_counts_allchrs_hg38.txt", keep_local=True, ), @@ -143,7 +152,8 @@ rule download_arbigent_mappability_track: rule download_scnova_data: input: ancient( - HTTP.remote( + # HTTP.remote( + storage.http( "https://zenodo.org/record/7697400/files/scNOVA_data_models.zip", keep_local=True, ) diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 41fa28de..65f316f0 100644 --- a/workflow/rules/plots.smk +++ b/workflow/rules/plots.smk @@ -38,14 +38,14 @@ rule divide_pdf: "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", output: report( - "{folder}/{sample}/plots/counts_raw/{cell}.{i, \d+}.pdf", + "{folder}/{sample}/plots/counts_raw/{cell}.{i, \\d+}.pdf", caption="../report/mosaic_counts.rst", category="Mosaic counts cellwise", subcategory="{sample}", labels={"Cell": "{cell}", "Nb": "{i}", "Type": "raw"}, ), log: - "{folder}/log/{sample}/plots/counts_raw/{cell}.{i, \d+}.log", + "{folder}/log/{sample}/plots/counts_raw/{cell}.{i, \\d+}.log", conda: "../envs/mc_base.yaml" resources: diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 5eeff593..31cea6a9 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -10,7 +10,7 @@ rule check_sm_tag: shell: """ sample_name="{wildcards.sample}" - sm_tag=$(samtools view -H {input} | grep '^@RG' | sed "s/.*SM:\([^\\t]*\).*/\\1/g") + sm_tag=$(samtools view -H {input} | grep '^@RG' | sed "s/.*SM:\\([^\\t]*\\).*/\\1/g") if [[ $sample_name == $sm_tag ]]; then echo "{input}: $sm_tag $sample_name OK" > {output} diff --git a/workflow/scripts/utils/publishdir.py b/workflow/scripts/utils/publishdir.py index 58dd36ad..a442fc03 100644 --- a/workflow/scripts/utils/publishdir.py +++ b/workflow/scripts/utils/publishdir.py @@ -2,6 +2,7 @@ import os +# Function to run shell commands and print the output def run_command(command): process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) while True: @@ -14,33 +15,41 @@ def run_command(command): return rc +# Configuration and wildcards from Snakemake data_location = snakemake.config["data_location"] publishdir = snakemake.config["publishdir"] run = snakemake.wildcards.folder.split("/")[-1] -print(snakemake.wildcards.folder) -print(run) +sample = snakemake.wildcards.sample + +# Directories to copy entirely +directories_to_copy = [ + f"{data_location}/{sample}/plots/", + f"{data_location}/{sample}/haplotag/bam/", + f"{data_location}/{sample}/mosaiclassifier/", + f"{data_location}/{sample}/counts/", + f"{data_location}/{sample}/cell_selection/", + f"{data_location}/{sample}/config/", + f"{data_location}/{sample}/segmentation/", + f"{data_location}/{sample}/snv_calls/", + f"{data_location}/{sample}/stats/", + # Add other directories as needed +] # Create base directory to maintain the structure -os.makedirs(f"{publishdir}/{run}", exist_ok=True) +os.makedirs(f"{publishdir}/{run}/{sample}", exist_ok=True) -for item in list(snakemake.input.list_publishdir): +for item in directories_to_copy: print(item) - # Replace the base path with the destination path destination_path = item.replace(data_location, f"{publishdir}/{run}") if os.path.isdir(item): - # Ensure the destination directory exists - os.makedirs(destination_path, exist_ok=True) # Copy the entire directory recursively - rsync_command = ( - f"rsync --ignore-existing -avzh --progress {item}/ {destination_path}/" - ) - else: - # Copy the file (including parent directory structure) - os.makedirs(os.path.dirname(destination_path), exist_ok=True) rsync_command = ( f"rsync --ignore-existing -avzh --progress {item} {destination_path}" ) + else: + # If it's a file or the directory doesn't exist, skip + continue print(rsync_command) run_command(rsync_command)