Commit

publishdir update
weber8thomas committed Jan 17, 2024
1 parent fb07c6e commit 8aaea88
Showing 6 changed files with 62 additions and 98 deletions.
2 changes: 2 additions & 0 deletions workflow/envs/mc_base.yaml
@@ -20,3 +20,5 @@ dependencies:
   # ArbiGent Hufsah deps
   - pytables
   - xopen
+  # Genome browsing
+  - pybigwig
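`pybigwig` is added for the new genome-browsing outputs. For context, a minimal sketch of what the library provides (the file name and region are illustrative, not from this commit):

```python
# Minimal pyBigWig usage sketch; "coverage.bw" and the region are hypothetical.
import pyBigWig

bw = pyBigWig.open("coverage.bw")    # open a bigWig signal track for reading
print(bw.chroms())                   # {'chr1': 248956422, ...}
print(bw.stats("chr1", 0, 100_000))  # mean signal over the region, e.g. [0.42]
bw.close()
```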
87 changes: 15 additions & 72 deletions workflow/rules/common.smk
@@ -778,10 +778,10 @@ def get_all_plots(wildcards):

     l_outputs.extend(
         expand(
-            "{folder}/{sample}/config/{conda}.yaml",
+            "{folder}/{sample}/config/{conda_env}.yaml",
             folder=config["data_location"],
             sample=wildcards.sample,
-            conda=conda_envs,
+            conda_env=conda_envs,
         ),
     )

@@ -929,10 +929,10 @@ def get_all_plots(wildcards):

     l_outputs.extend(
         expand(
-            "{folder}/{sample}/config/conda_export/{conda}.yaml",
+            "{folder}/{sample}/config/conda_export/{conda_env}.yaml",
             folder=config["data_location"],
             sample=wildcards.sample,
-            conda=conda_envs,
+            conda_env=conda_envs,
         ),
     )
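Both hunks above make the same rename, `conda` → `conda_env`, presumably to avoid shadowing or confusion with Snakemake's `conda:` directive keyword. Functionally nothing changes: `expand()` substitutes each keyword argument into the placeholder of the same name. A minimal illustration with made-up values:

```python
# expand() fills {placeholders} from keyword arguments; values here are made up.
from snakemake.io import expand

paths = expand(
    "{folder}/{sample}/config/{conda_env}.yaml",
    folder="/data",
    sample="SAMPLE1",
    conda_env=["mc_base", "rtools"],
)
print(paths)
# ['/data/SAMPLE1/config/mc_base.yaml', '/data/SAMPLE1/config/rtools.yaml']
```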

@@ -959,82 +959,25 @@ def get_all_plots(wildcards):
     return l_outputs


 def publishdir_fct_mc(wildcards):
     """
-    Restricted for ASHLEYS at the moment
-    Backup files on a secondary location
+    Function to generate a list of files and directories for backup.
     """
-    list_files_to_copy = [
-        # ASHLEYS
-        "{folder}/{sample}/cell_selection/labels_raw.tsv",
-        "{folder}/{sample}/cell_selection/labels.tsv",
-        "{folder}/{sample}/counts/{sample}.info_raw",
-        "{folder}/{sample}/counts/{sample}.txt.raw.gz",
-        # "{folder}/{sample}/config/config.yaml",
-        # MC
-        "{folder}/{sample}/config/config.yaml",
-    ]
-
-    list_files_to_copy += [
+    list_files_to_copy = [
         e for e in get_all_plots(wildcards) if "publishdir_outputs_mc.ok" not in e
     ]

-    final_list = [
-        expand(e, folder=config["data_location"], sample=wildcards.sample)
-        for e in list_files_to_copy
+    # Expand the paths for files
+    expanded_files = [
+        expand(file_path, folder=config["data_location"], sample=wildcards.sample)
+        for file_path in list_files_to_copy
     ]
-    final_list = [sub_e for e in final_list for sub_e in e]
-    final_list.extend(
-        expand(
-            "{folder}/{sample}/plots/counts/CountComplete.{plottype_counts}.pdf",
-            folder=config["data_location"],
-            sample=wildcards.sample,
-            plottype_counts=plottype_counts,
-        )
-    )
-
-    if config["use_light_data"] is False:
-        final_list.extend(
-            expand(
-                "{folder}/{sample}/plots/plate/ashleys_plate_{plate_plot}.pdf",
-                folder=config["data_location"],
-                sample=wildcards.sample,
-                plate_plot=["predictions", "probabilities"],
-            )
-        )
-        final_list.extend(
-            expand(
-                "{folder}/{sample}/cell_selection/labels_positive_control_corrected.tsv",
-                folder=config["data_location"],
-                sample=wildcards.sample,
-            )
-        )
-        final_list.extend(
-            expand(
-                "{folder}/{sample}/config/bypass_cell.txt",
-                folder=config["data_location"],
-                sample=wildcards.sample,
-            )
-        )
-
-    # folders_to_keep = [
-    #     "plots",
-    #     "snv_calls",
-    #     "segmentation",
-    #     "haplotag",
-    #     "strandphaser",
-    #     "ploidy",
-    #     "stats",
-    #     "mosaiclassifier",
-    # ]
-
-    # final_list += expand(
-    #     "{folder}/{sample}/{folder_to_keep}/",
-    #     folder=config["data_location"],
-    #     sample=wildcards.sample,
-    #     folder_to_keep=folders_to_keep,
-    # )
-
+    final_list = [sub_e for e in expanded_files for sub_e in e]
     # print(final_list)


     return final_list
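The refactor replaces the hand-maintained backup list with whatever `get_all_plots` reports (minus the sentinel file), then expands and flattens. Since each `expand()` call returns a list, the comprehension produces a list of lists that the last line flattens. A standalone sketch with hypothetical paths:

```python
# Expand-then-flatten, as in publishdir_fct_mc above; paths are hypothetical.
from snakemake.io import expand

list_files_to_copy = ["{folder}/{sample}/a.pdf", "{folder}/{sample}/b.pdf"]

# Each expand() returns a list, so this builds a list of lists...
expanded_files = [
    expand(file_path, folder="/data", sample="S1")
    for file_path in list_files_to_copy
]

# ...which a nested comprehension flattens into one list of concrete paths.
final_list = [sub_e for e in expanded_files for sub_e in e]
print(final_list)  # ['/data/S1/a.pdf', '/data/S1/b.pdf']
```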
30 changes: 20 additions & 10 deletions workflow/rules/external_data.smk
@@ -1,12 +1,15 @@
 import os
-from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
+# from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider

-HTTP = HTTPRemoteProvider()
+storage:
+    provider="http",
+# HTTP = HTTPRemoteProvider()


 rule dl_example_data:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://sandbox.zenodo.org/record/1074721/files/TEST_EXAMPLE_DATA.zip",
             keep_local=True,
         ),
@@ -20,7 +23,8 @@ rule dl_example_data:

 rule download_hg19_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/analysisSet/hg19.p13.plusMT.no_alt_analysis_set.fa.gz",
             keep_local=True,
         ),
@@ -41,7 +45,8 @@ rule download_hg19_reference:

 rule download_hg38_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz",
             keep_local=True,
         ),
@@ -62,7 +67,8 @@ rule download_hg38_reference:

 rule download_T2T_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz",
             keep_local=True,
         ),
@@ -83,7 +89,8 @@ rule download_T2T_reference:

 rule download_mm10_reference:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz",
             keep_local=True,
         ),
@@ -104,7 +111,8 @@ rule download_mm10_reference:

 rule download_T2T_tarball:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz",
             keep_local=True,
         ),
@@ -124,7 +132,8 @@ rule download_T2T_tarball:

 rule download_arbigent_mappability_track:
     input:
-        HTTP.remote(
+        # HTTP.remote(
+        storage.http(
             "https://zenodo.org/record/7697400/files/mapping_counts_allchrs_hg38.txt",
             keep_local=True,
         ),
@@ -143,7 +152,8 @@ rule download_arbigent_mappability_track:
 rule download_scnova_data:
     input:
         ancient(
-            HTTP.remote(
+            # HTTP.remote(
+            storage.http(
                 "https://zenodo.org/record/7697400/files/scNOVA_data_models.zip",
                 keep_local=True,
             )
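Every hunk in this file applies the same migration: Snakemake 8 removed the `snakemake.remote` providers, so `HTTP.remote(...)` calls are commented out and replaced by the storage-plugin interface declared at the top of the file. A condensed sketch of the new pattern, assuming the `snakemake-storage-plugin-http` package is installed (the rule body is illustrative; only the `storage`/`input` lines mirror the actual change):

```python
# Snakefile sketch of the storage-plugin pattern used above.
storage:
    provider="http",


rule download_example:
    input:
        # storage.http(...) replaces HTTP.remote(...); keep_local=True keeps
        # the downloaded copy on disk instead of treating it as ephemeral.
        storage.http(
            "https://example.org/reference.fa.gz",
            keep_local=True,
        ),
    output:
        "reference/reference.fa.gz",
    shell:
        "cp {input} {output}"
```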
4 changes: 2 additions & 2 deletions workflow/rules/plots.smk
@@ -38,14 +38,14 @@ rule divide_pdf:
         "{folder}/{sample}/plots/counts/CountComplete.raw.pdf",
     output:
         report(
-            "{folder}/{sample}/plots/counts_raw/{cell}.{i, \d+}.pdf",
+            "{folder}/{sample}/plots/counts_raw/{cell}.{i, \\d+}.pdf",
             caption="../report/mosaic_counts.rst",
             category="Mosaic counts cellwise",
             subcategory="{sample}",
             labels={"Cell": "{cell}", "Nb": "{i}", "Type": "raw"},
         ),
     log:
-        "{folder}/log/{sample}/plots/counts_raw/{cell}.{i, \d+}.log",
+        "{folder}/log/{sample}/plots/counts_raw/{cell}.{i, \\d+}.log",
     conda:
         "../envs/mc_base.yaml"
     resources:
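The only change here is `\d` → `\\d` inside the wildcard constraint. In a normal Python string, `\d` is an invalid escape sequence (a `DeprecationWarning` since Python 3.6 and a `SyntaxWarning` in 3.12), so the backslash must be doubled to reach the regex engine intact. A quick illustration:

```python
import re

# "\\d" in a normal string literal is the two characters backslash + d,
# exactly what a raw string r"\d" denotes, and what the regex engine expects.
assert "\\d" == r"\d"

# The wildcard constraint {i, \d+} ultimately behaves like this regex:
assert re.fullmatch(r"\d+", "42")
assert not re.fullmatch(r"\d+", "4a")
```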
2 changes: 1 addition & 1 deletion workflow/rules/utils.smk
@@ -10,7 +10,7 @@ rule check_sm_tag:
     shell:
         """
         sample_name="{wildcards.sample}"
-        sm_tag=$(samtools view -H {input} | grep '^@RG' | sed "s/.*SM:\([^\\t]*\).*/\\1/g")
+        sm_tag=$(samtools view -H {input} | grep '^@RG' | sed "s/.*SM:\\([^\\t]*\\).*/\\1/g")
         if [[ $sample_name == $sm_tag ]]; then
             echo "{input}: $sm_tag $sample_name OK" > {output}
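Same root cause as in plots.smk: the `shell:` block is processed as a Python string before the command reaches bash, so backslashes intended for `sed` must be doubled. A sketch of what `sed` actually receives after Python unescaping (illustrative, outside Snakemake):

```python
# After Python unescapes the shell: string, sed sees single backslashes:
shell_string = "sed \"s/.*SM:\\([^\\t]*\\).*/\\1/g\""
print(shell_string)
# sed "s/.*SM:\([^\t]*\).*/\1/g"  <- \( \) capture the SM tag value, \1 recalls it
```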
35 changes: 22 additions & 13 deletions workflow/scripts/utils/publishdir.py
@@ -2,6 +2,7 @@
 import os


+# Function to run shell commands and print the output
 def run_command(command):
     process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
     while True:
@@ -14,33 +15,41 @@ def run_command(command):
     return rc


+# Configuration and wildcards from Snakemake
 data_location = snakemake.config["data_location"]
 publishdir = snakemake.config["publishdir"]
 run = snakemake.wildcards.folder.split("/")[-1]
 print(snakemake.wildcards.folder)
 print(run)
 sample = snakemake.wildcards.sample

+# Directories to copy entirely
+directories_to_copy = [
+    f"{data_location}/{sample}/plots/",
+    f"{data_location}/{sample}/haplotag/bam/",
+    f"{data_location}/{sample}/mosaiclassifier/",
+    f"{data_location}/{sample}/counts/",
+    f"{data_location}/{sample}/cell_selection/",
+    f"{data_location}/{sample}/config/",
+    f"{data_location}/{sample}/segmentation/",
+    f"{data_location}/{sample}/snv_calls/",
+    f"{data_location}/{sample}/stats/",
+    # Add other directories as needed
+]
+
+# Create base directory to maintain the structure
 os.makedirs(f"{publishdir}/{run}", exist_ok=True)
 os.makedirs(f"{publishdir}/{run}/{sample}", exist_ok=True)

-for item in list(snakemake.input.list_publishdir):
+for item in directories_to_copy:
     print(item)
+    # Replace the base path with the destination path
     destination_path = item.replace(data_location, f"{publishdir}/{run}")

     if os.path.isdir(item):
+        # Ensure the destination directory exists
         os.makedirs(destination_path, exist_ok=True)
+        # Copy the entire directory recursively
         rsync_command = (
             f"rsync --ignore-existing -avzh --progress {item}/ {destination_path}/"
         )
     else:
-        # Copy the file (including parent directory structure)
-        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
-        rsync_command = (
-            f"rsync --ignore-existing -avzh --progress {item} {destination_path}"
-        )
+        # If it's a file or the directory doesn't exist, skip
+        continue

     print(rsync_command)
     run_command(rsync_command)
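To make the rewritten copy loop concrete, here is the path mapping it performs, using hypothetical config values (the real ones come from `snakemake.config` and the rule's wildcards):

```python
# Hypothetical values illustrating the destination mapping in the loop above.
data_location = "/scratch/run_2024_01"  # snakemake.config["data_location"]
publishdir = "/archive"                 # snakemake.config["publishdir"]
run = "run_2024_01"                     # last component of wildcards.folder
sample = "SAMPLE1"

item = f"{data_location}/{sample}/plots"  # one of directories_to_copy
destination_path = item.replace(data_location, f"{publishdir}/{run}")
print(destination_path)  # /archive/run_2024_01/SAMPLE1/plots

# rsync copies recursively (-a) but --ignore-existing never overwrites files
# already present at the destination:
print(f"rsync --ignore-existing -avzh --progress {item}/ {destination_path}/")
```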
