From 791b7715b12f3439b1e4686039f53f3c3ef13ede Mon Sep 17 00:00:00 2001 From: Saim Momin Date: Mon, 6 Mar 2023 15:32:07 +0100 Subject: [PATCH 01/33] Cleanup --- config/README.md | 2 - config/config.yaml | 13 ----- config/file_layout.yaml | 10 ---- config/samples.csv | 2 - envs/kraken2.yaml | 7 --- envs/synapse.yaml | 65 ---------------------- envs/umi_tools.yaml | 6 --- test/test_download.sh | 2 - workflow/Snakefile | 116 ---------------------------------------- workflow/common.smk | 40 -------------- 10 files changed, 263 deletions(-) delete mode 100644 config/README.md delete mode 100644 config/config.yaml delete mode 100644 config/file_layout.yaml delete mode 100644 config/samples.csv delete mode 100755 envs/kraken2.yaml delete mode 100644 envs/synapse.yaml delete mode 100755 envs/umi_tools.yaml delete mode 100644 test/test_download.sh delete mode 100644 workflow/Snakefile delete mode 100644 workflow/common.smk diff --git a/config/README.md b/config/README.md deleted file mode 100644 index 44c6f7e..0000000 --- a/config/README.md +++ /dev/null @@ -1,2 +0,0 @@ -Describe how to configure the workflow (using config.yaml and maybe additional files). -All of them need to be present with example entries inside of the config folder. 
diff --git a/config/config.yaml b/config/config.yaml deleted file mode 100644 index 64d03ef..0000000 --- a/config/config.yaml +++ /dev/null @@ -1,13 +0,0 @@ -defaults: - bc_pattern: CCCCCCCCCCCCCCCCNNNNNNNNNN - cell_number: 1000 - -output_dir: "results" - -file_layout: "config/file_layout.yaml" -samples: "config/samples.csv" - -kraken: - threads: 8 - db_paths: - virus: /data/repository/kraken2_contaminome/virus_db \ No newline at end of file diff --git a/config/file_layout.yaml b/config/file_layout.yaml deleted file mode 100644 index 95fc405..0000000 --- a/config/file_layout.yaml +++ /dev/null @@ -1,10 +0,0 @@ -umi_tools_whitelist: - whitelist: {sample}_whitelist.txt - -umi_tools_extract: - read_one: {sample}_extracted_R1.fastq.gz - read_two: {sample}_extracted_R2.fastq.gz - -kraken: - report: {sample}_mapped_to_{db}.k2report - output: {sample}_mapped_to_{db}.kraken2 \ No newline at end of file diff --git a/config/samples.csv b/config/samples.csv deleted file mode 100644 index cc80bbb..0000000 --- a/config/samples.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample,R1_path,R2_path -GSM4658507,/data/manke/group/koppstein/data/covid_data/GSM4658507/,/data/manke/group/koppstein/data/covid_data/GSM4658507/ \ No newline at end of file diff --git a/envs/kraken2.yaml b/envs/kraken2.yaml deleted file mode 100755 index 6668674..0000000 --- a/envs/kraken2.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - kraken2 - diff --git a/envs/synapse.yaml b/envs/synapse.yaml deleted file mode 100644 index 88a6939..0000000 --- a/envs/synapse.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: synapse_py -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - bcrypt=3.2.2=py311hd4cff14_1 - - bzip2=1.0.8=h7f98852_4 - - ca-certificates=2022.12.7=ha878542_0 - - cffi=1.15.1=py311h409f033_3 - - cryptography=39.0.0=py311h9b4c7bb_0 - - ld_impl_linux-64=2.39=hcc3a1bd_1 - - 
libblas=3.9.0=16_linux64_openblas - - libcblas=3.9.0=16_linux64_openblas - - libffi=3.4.2=h7f98852_5 - - libgcc-ng=12.2.0=h65d4601_19 - - libgfortran-ng=12.2.0=h69a702a_19 - - libgfortran5=12.2.0=h337968e_19 - - libgomp=12.2.0=h65d4601_19 - - liblapack=3.9.0=16_linux64_openblas - - libnsl=2.0.0=h7f98852_0 - - libopenblas=0.3.21=pthreads_h78a6416_3 - - libsodium=1.0.18=h36c2ea0_1 - - libsqlite=3.40.0=h753d276_0 - - libstdcxx-ng=12.2.0=h46fd767_19 - - libuuid=2.32.1=h7f98852_1000 - - libzlib=1.2.13=h166bdaf_4 - - ncurses=6.3=h27087fc_1 - - numpy=1.24.1=py311hbde0eaa_0 - - openssl=3.0.7=h0b41bf4_1 - - pandas=1.5.3=py311h2872171_0 - - paramiko=2.12.0=pyhd8ed1ab_0 - - pip=22.3.1=pyhd8ed1ab_0 - - pycparser=2.21=pyhd8ed1ab_0 - - pynacl=1.5.0=py311hd4cff14_2 - - pysftp=0.2.9=py_1 - - python=3.11.0=he550d4f_1_cpython - - python-dateutil=2.8.2=pyhd8ed1ab_0 - - python_abi=3.11=3_cp311 - - pytz=2022.7.1=pyhd8ed1ab_0 - - readline=8.1.2=h0f457ee_0 - - setuptools=66.1.1=pyhd8ed1ab_0 - - six=1.16.0=pyh6c4a22f_0 - - tk=8.6.12=h27826a3_0 - - tzdata=2022g=h191b570_0 - - wheel=0.38.4=pyhd8ed1ab_0 - - xz=5.2.6=h166bdaf_0 - - pip: - - certifi==2022.12.7 - - charset-normalizer==3.0.1 - - deprecated==1.2.13 - - idna==3.4 - - importlib-metadata==6.0.0 - - jeepney==0.8.0 - - keyring==23.4.1 - - keyrings-alt==3.1 - - requests==2.28.2 - - secretstorage==3.3.3 - - synapseclient==2.7.0 - - urllib3==1.26.14 - - wrapt==1.14.1 - - zipp==3.11.0 -prefix: /localenv/koppstein/anaconda/miniconda3/envs/synapse_py diff --git a/envs/umi_tools.yaml b/envs/umi_tools.yaml deleted file mode 100755 index acb24fe..0000000 --- a/envs/umi_tools.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - umi_tools diff --git a/test/test_download.sh b/test/test_download.sh deleted file mode 100644 index 9b68903..0000000 --- a/test/test_download.sh +++ /dev/null @@ -1,2 +0,0 @@ -# conda activate synapse_py -synapse -u koppstein get syn26011026 diff --git 
a/workflow/Snakefile b/workflow/Snakefile deleted file mode 100644 index 44d078a..0000000 --- a/workflow/Snakefile +++ /dev/null @@ -1,116 +0,0 @@ -# Main entrypoint of the workflow. -# Please follow the best practices: -# https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html, -# in particular regarding the standardized folder structure mentioned there. - - -include: "workflow/common.smk" - - -SAMPLES = list(METADATA.index) - -get_bc_pattern = _get_default('bc_pattern') -get_cell_number = _get_default('cell_number') - -paths = create_path_accessor() - -rule all: - input: expand(paths.kraken.output, sample=METADATA, db=['virus']) - -def find_path(wc): - "Return the R1_path or R2_path the given sample." - return METADATA.loc[str(wc.sample), "R{}_path".format(wc.num)] - -# symlink read one and two to resources/ -rule symlink_reads: - output: - "resources/{sample}_R{num}.fastq.gz", - params: - path=find_path - shell: - "ln -fs `readlink -f {params.path}` {output}" - -rule umi_tools_whitelist: - input: - read_one="resources/{sample}_R{num}.fastq.gz", - output: - paths.umi_tools_whitelist.whitelist - params: - bc_pattern=get_bc_pattern, - cell_number=get_cell_number, - conda: - "envs/umi_tools.yaml" - log: - "results/logs/whitelist/{sample}_whitelist.log" - shell: - "umi_tools whitelist " - "--stdin {input.read_one} " - "--bc-pattern={params.bc_pattern} " - "--set-cell-number={params.cell_number} " - "--log2stderr > {output} 2> {log} " - - -rule umi_tools_extract: - input: - read_one="resources/{sample}_R1.fastq.gz", - read_two="resources/{sample}_R2.fastq.gz", - whitelist=paths.umi_tools_whitelist.whitelist, - output: - read_one=paths.umi_tools_extract.read_one, - read_two=paths.umi_tools_extract.read_two, - params: - bc_pattern=get_bc_pattern, - cell_number=get_cell_number, - conda: - "envs/umi_tools.yaml" - log: - "results/logs/whitelist/{sample}_whitelist.log" - shell: - "umi_tools extract " - "--stdin {input.read_one} " - 
"--bc-pattern={params.bc_pattern} " - "--stdout {output.read_one} " - "--read2-in {input.read_two} " - "--read2-out {output.read_two} " - "--whitelist={input.whitelist} " - -def krak_db(wc): - return config['kraken']['db_paths'][str(wc.db)] - -rule kraken: - input: - read_one=paths.umi_tools_extract.read_one, - read_two=paths.umi_tools_extract.read_two, - output: - kraken_report=paths.kraken.report, - kraken_output=paths.kraken.output, - threads: - config['kraken']['threads'] - params: - db=krak_db - conda: - "envs/kraken2.yaml" - shell: - "kraken2 --db {params.db} " - "--threads {threads} " - "--report {output.kraken_report} " - "{input} " - "> {output.kraken_output} " - -# TODO: how to deal with clustering/deduping reads if we can't use BAM files? -# Do we even care about this? -# c.f. https://github.com/CGATOxford/UMI-tools/issues/436 -# also https://pubmed.ncbi.nlm.nih.gov/30351359/ -# Personally I think we want to use the UMIclusterer class, and if the -# reads are in the same UMI cluster *and* are mapping to the same kraken -# db, then we deduplicate them. - -rule get_synapse: - output: - "downloads/{sample}_R{num}.fastq.gz" - conda: - "envs/synapse.yaml" - params: - - shell: - "synapse -" \ No newline at end of file diff --git a/workflow/common.smk b/workflow/common.smk deleted file mode 100644 index 203deb4..0000000 --- a/workflow/common.smk +++ /dev/null @@ -1,40 +0,0 @@ -import pandas as pd -from box import Box -from pathlib import Path - -OUTDIR = Path(config["output_dir"]) - -METADATA = pd.read_csv( - (Path(workflow.basedir) / config["samples"]), - dtype=str, - ).set_index(["sample"], drop=False) - - -def _get_default(param): - "Return an input function (taking wildcards) that takes the given " - " parameter `param`, for a particular sample, otherwise looks up - "config["defaults"][param] if it is not specified." 
- def inner(wc): - val = METADATA.loc[str(wc.sample)][param] - if pd.isnan(val): - if param in config["defaults"]: - return config["defaults"][param] - else: - raise Exception( - "Error: param {param} not found for sample {sample} in " - "either config.yaml or samples.csv" - ) - return val - return inner - - -def create_path_accessor(prefix: Path = OUTDIR) -> Box: - """Create a Box to provide '.' access to hierarchy of paths""" - data = yaml.load(Path(config["file_layout"]).open(), Loader=yaml.SafeLoader) - paths = {} - for directory in data.keys(): - paths[directory] = {} - for file_alias, file_name in data[directory].items(): - p = str(prefix / directory / file_name) - paths[directory][file_alias] = str(p) - return Box(paths, frozen_box=True) From a69e97bbfcaae62876051db7e12d5e3882fc5290 Mon Sep 17 00:00:00 2001 From: Saim Momin Date: Mon, 6 Mar 2023 15:34:32 +0100 Subject: [PATCH 02/33] Initial Commit --- SRA.tsv | 2 ++ Snakefile | 86 +++++++++++++++++++++++++++++++++++++++++++++++++ config.yaml | 2 ++ envs/tools.yaml | 9 ++++++ 4 files changed, 99 insertions(+) create mode 100644 SRA.tsv create mode 100644 Snakefile create mode 100644 config.yaml create mode 100644 envs/tools.yaml diff --git a/SRA.tsv b/SRA.tsv new file mode 100644 index 0000000..13e050f --- /dev/null +++ b/SRA.tsv @@ -0,0 +1,2 @@ +Samples +SRR10799852 diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..555f22e --- /dev/null +++ b/Snakefile @@ -0,0 +1,86 @@ +import glob +import os +import pandas as pd + +#configfile = "/data/manke/processing/momin/virome-scan/workflow/config.yaml", +data_dir = "/data/manke/processing/momin/virome-scan/workflow" + +accession_list = pd.read_table("SRA.tsv") +samples = list(accession_list.Samples.unique()) + + +rule all: + input: + expand("data/{sample}_1.fastq.gz", sample=samples), + expand("data/{sample}_2.fastq.gz", sample=samples), + expand("results/whitelist/{sample}_whitelist.txt", sample=samples), + 
expand("results/umi/{sample}_R1_extracted.fastq.gz",sample=samples), + expand("results/umi/{sample}_R2_extracted.fastq.gz",sample=samples) + + +rule download_sample: + output: + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz" + params: + outdir = "data", + threads = 8 + conda: + "envs/tools.yaml" + shell: + "parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {params.threads} --outdir {params.outdir} --gzip" + + +rule whitelist_generation: + input: + i1 = "data/{sample}_1.fastq.gz", + output: + "results/whitelist/{sample}_whitelist.txt" + conda: + "envs/tools.yaml" + log: + "results/log/whitelist/{sample}_umi_extract.log" + shell: + "umi_tools whitelist --stdin {input.i1} --bc-pattern={config[bc_pattern]} --set-cell-number={config[cell_no]} --log2stderr > {output} 2> {log}" + + +rule umi_extract: + input: + i1 = "data/{sample}_1.fastq.gz", + i2 = "data/{sample}_2.fastq.gz" + output: + o1 = "results/umi/{sample}_R1_extracted.fastq.gz", + o2 = "results/umi/{sample}_R2_extracted.fastq.gz" + params: + p1 = "results/whitelist/{sample}_whitelist.txt" + conda: + "envs/tools.yaml" + log: + "results/log/umi/{sample}_umi_extract.log" + shell: + "umi_tools extract --stdin {input.i1} --bc-pattern=CCCCCCCCCCCCCCCCNNNNNNNNNN --stdout {output.o1} --read2-in {input.i2} --read2-out={output.o2} --whitelist={params.p1}" + + +rule alignment: + input: + i1 = "results/umi/{sample}_R1_extracted.fastq.gz", + i2 = "results/umi/{sample}_R2_extracted.fastq.gz" + output: + o1 = "results/alignment/{sample}.sam" + params: + p1 = #TODO Path to Database #Config to the external Bowtie2 index ideally needs to be given by runtime config. 
+ conda: + "envs/tools.yaml" + threads: 16 + log: + "results/log/alignment/{sample}_aln.log" + shell: + "bowtie2 -x {params.p1} -1 {input.i1} -2 {input.i2} --very-fast-local --no-unal -S {output.o1} -p {threads}" + + +onsuccess: + print("Snakemake finished successfully!") + +onerror: + print("Snakemake has failed!") + diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..e120fdd --- /dev/null +++ b/config.yaml @@ -0,0 +1,2 @@ +"bc_pattern" : CCCCCCCCCCCCCCCCNNNNNNNNNN +"cell_no" : 100 \ No newline at end of file diff --git a/envs/tools.yaml b/envs/tools.yaml new file mode 100644 index 0000000..842fc5a --- /dev/null +++ b/envs/tools.yaml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - entrez-direct + - parallel-fastq-dump + - umi_tools + - bowtie2 \ No newline at end of file From e8805bc2c39f417e9788d3ce1091388c4e93b70b Mon Sep 17 00:00:00 2001 From: Saim Momin <64724322+SaimMomin12@users.noreply.github.com> Date: Mon, 6 Mar 2023 15:42:56 +0100 Subject: [PATCH 03/33] Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ec70b54..6b2fdd7 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,19 @@ -# Snakemake workflow: `Single-cell Virome Scan` +# Snakemake workflow: `Single-Cell Virome Scan` [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Tests/badge.svg?branch=main)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) -A Snakemake workflow for processing . +A Snakemake workflow for processing Single Cell Virome Scan. +## Note +This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. 
## Usage +> snakemake --cores 8 --use-conda --configfile config.yaml --latency-wait 60 + + The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=maxplanck-ie%2Fsc-virome-scan). If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). From f41d9a3e116ed45a07957cdd8f4afc266534f218 Mon Sep 17 00:00:00 2001 From: Saim Date: Thu, 16 Mar 2023 18:34:18 +0100 Subject: [PATCH 04/33] Updated Framework of Pipeline --- Snakefile | 119 ++++++++++++++------------------ config.yaml | 5 +- envs/kraken2.yaml | 6 ++ envs/samtools.yaml | 6 ++ rules/cellranger.smk | 18 +++++ rules/download_samples.smk | 21 ++++++ rules/envs/kraken2.yaml | 6 ++ rules/envs/samtools.yaml | 6 ++ rules/envs/tools.yaml | 9 +++ rules/extract_bam.smk | 9 +++ rules/extract_tags.smk | 13 ++++ rules/kraken2_mapping.smk | 14 ++++ rules/kraken2_mapping_local.smk | 13 ++++ scripts/bam_extract.py | 58 ++++++++++++++++ 14 files changed, 235 insertions(+), 68 deletions(-) create mode 100644 envs/kraken2.yaml create mode 100644 envs/samtools.yaml create mode 100644 rules/cellranger.smk create mode 100644 rules/download_samples.smk create mode 100644 rules/envs/kraken2.yaml create mode 100644 rules/envs/samtools.yaml create mode 100644 rules/envs/tools.yaml create mode 100644 rules/extract_bam.smk create mode 100644 rules/extract_tags.smk create mode 100644 rules/kraken2_mapping.smk create mode 100644 rules/kraken2_mapping_local.smk create mode 100644 scripts/bam_extract.py diff --git a/Snakefile b/Snakefile index 555f22e..a2f3fdd 100644 --- a/Snakefile +++ b/Snakefile @@ -3,79 +3,66 @@ import os import pandas as pd #configfile = "/data/manke/processing/momin/virome-scan/workflow/config.yaml", -data_dir = "/data/manke/processing/momin/virome-scan/workflow" +data_dir = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data" + 
accession_list = pd.read_table("SRA.tsv") samples = list(accession_list.Samples.unique()) +""" +samples, = glob_wildcards("/data/manke/processing/momin/virome-scan/sc-virome-scan/data/{sample}_L001_R1.fastq.gz") +print(samples) +""" + +#ruleorder: download_samples > cellranger > kraken2_mapping + +if config["files"] == "local": + include: "rules/kraken2_mapping_local.smk" + +else: + include: "rules/download_samples.smk" + include: "rules/kraken2_mapping.smk" + +def input_files(): + if config["files"] == "local": + l1 = [ + expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), + expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples)] + return(l1) + else: + l1 = [ + expand("data/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), + expand("data/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), + expand("data/{sample}_test.txt",sample=samples), + expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), + expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples) + ] + return(l1) + +include: "rules/cellranger.smk" +include: "rules/extract_bam.smk" +include: "rules/extract_tags.smk" + rule all: input: - expand("data/{sample}_1.fastq.gz", sample=samples), - expand("data/{sample}_2.fastq.gz", sample=samples), - expand("results/whitelist/{sample}_whitelist.txt", sample=samples), - expand("results/umi/{sample}_R1_extracted.fastq.gz",sample=samples), - expand("results/umi/{sample}_R2_extracted.fastq.gz",sample=samples) - - -rule download_sample: - output: - "data/{sample}_1.fastq.gz", - "data/{sample}_2.fastq.gz" - params: - outdir = "data", - threads = 8 - conda: - "envs/tools.yaml" - shell: - "parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {params.threads} --outdir {params.outdir} --gzip" - - -rule whitelist_generation: - input: - i1 = "data/{sample}_1.fastq.gz", - output: - "results/whitelist/{sample}_whitelist.txt" - conda: - "envs/tools.yaml" - log: - 
"results/log/whitelist/{sample}_umi_extract.log" - shell: - "umi_tools whitelist --stdin {input.i1} --bc-pattern={config[bc_pattern]} --set-cell-number={config[cell_no]} --log2stderr > {output} 2> {log}" - - -rule umi_extract: - input: - i1 = "data/{sample}_1.fastq.gz", - i2 = "data/{sample}_2.fastq.gz" - output: - o1 = "results/umi/{sample}_R1_extracted.fastq.gz", - o2 = "results/umi/{sample}_R2_extracted.fastq.gz" - params: - p1 = "results/whitelist/{sample}_whitelist.txt" - conda: - "envs/tools.yaml" - log: - "results/log/umi/{sample}_umi_extract.log" - shell: - "umi_tools extract --stdin {input.i1} --bc-pattern=CCCCCCCCCCCCCCCCNNNNNNNNNN --stdout {output.o1} --read2-in {input.i2} --read2-out={output.o2} --whitelist={params.p1}" - - -rule alignment: - input: - i1 = "results/umi/{sample}_R1_extracted.fastq.gz", - i2 = "results/umi/{sample}_R2_extracted.fastq.gz" - output: - o1 = "results/alignment/{sample}.sam" - params: - p1 = #TODO Path to Database #Config to the external Bowtie2 index ideally needs to be given by runtime config. 
- conda: - "envs/tools.yaml" - threads: 16 - log: - "results/log/alignment/{sample}_aln.log" - shell: - "bowtie2 -x {params.p1} -1 {input.i1} -2 {input.i2} --very-fast-local --no-unal -S {output.o1} -p {threads}" + input_files(), + expand("results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam",sample=samples), + expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), + expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples) + + + + + + + + + + + + + onsuccess: diff --git a/config.yaml b/config.yaml index e120fdd..3681f39 100644 --- a/config.yaml +++ b/config.yaml @@ -1,2 +1,3 @@ -"bc_pattern" : CCCCCCCCCCCCCCCCNNNNNNNNNN -"cell_no" : 100 \ No newline at end of file +"files" : "local" +"chemistry" : "SC3Pv2" +"transcriptome" : "/data/repository/misc/cellranger_references/cellranger/refdata-gex-GRCh38-2020-A" diff --git a/envs/kraken2.yaml b/envs/kraken2.yaml new file mode 100644 index 0000000..516686a --- /dev/null +++ b/envs/kraken2.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - kraken2 \ No newline at end of file diff --git a/envs/samtools.yaml b/envs/samtools.yaml new file mode 100644 index 0000000..2c1c845 --- /dev/null +++ b/envs/samtools.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - samtools \ No newline at end of file diff --git a/rules/cellranger.smk b/rules/cellranger.smk new file mode 100644 index 0000000..b851101 --- /dev/null +++ b/rules/cellranger.smk @@ -0,0 +1,18 @@ +rule cellranger: + input: + i1 = expand("data/{sample}_test.txt", sample=samples) + output: + o1 = "results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam" + priority: 90 + log: + "results/cellranger/{sample}/{sample}_cellranger.log" + params: + p1 = config["chemistry"], + p2 = config["transcriptome"], + p3 = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data/", + p4 = 
"/data/manke/processing/momin/virome-scan/kraken2/negative_ctrl/data/" + shell: + """ + cd results/cellranger/{wildcards.sample}/ + cellranger count --id {wildcards.sample} --fastqs {params.p4} --transcriptome {config[transcriptome]} --chemistry SC3Pv2 + """ diff --git a/rules/download_samples.smk b/rules/download_samples.smk new file mode 100644 index 0000000..4e77e03 --- /dev/null +++ b/rules/download_samples.smk @@ -0,0 +1,21 @@ +rule download_samples: + output: + o1 = "data/{sample}_S1_L001_R1_001.fastq.gz", + o2 = "data/{sample}_S1_L001_R2_001.fastq.gz", + o3 = temp("data/{sample}_test.txt") + params: + outdir = "data" + threads: 16 + priority: 100 + conda: + "envs/tools.yaml" + shell: + """ + parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip + mv data/{wildcards.sample}_1.fastq.gz data/{wildcards.sample}_S1_L001_R1_001.fastq.gz + mv data/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}_S1_L001_R2_001.fastq.gz + touch {output.o3} + """ + + + \ No newline at end of file diff --git a/rules/envs/kraken2.yaml b/rules/envs/kraken2.yaml new file mode 100644 index 0000000..516686a --- /dev/null +++ b/rules/envs/kraken2.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - kraken2 \ No newline at end of file diff --git a/rules/envs/samtools.yaml b/rules/envs/samtools.yaml new file mode 100644 index 0000000..2c1c845 --- /dev/null +++ b/rules/envs/samtools.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - samtools \ No newline at end of file diff --git a/rules/envs/tools.yaml b/rules/envs/tools.yaml new file mode 100644 index 0000000..842fc5a --- /dev/null +++ b/rules/envs/tools.yaml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - entrez-direct + - parallel-fastq-dump + - umi_tools + - bowtie2 \ No newline at end of file diff --git a/rules/extract_bam.smk b/rules/extract_bam.smk new 
file mode 100644 index 0000000..ad0a901 --- /dev/null +++ b/rules/extract_bam.smk @@ -0,0 +1,9 @@ +rule extract_bam: + input: + "results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam" + output: + "results/cellranger/{sample}/unmapped_reads.sam" + conda: + "envs/samtools.yaml" + shell: + "samtools view -f 4 {input} > {output}" \ No newline at end of file diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk new file mode 100644 index 0000000..c2ee38d --- /dev/null +++ b/rules/extract_tags.smk @@ -0,0 +1,13 @@ +rule extract_tags: + input: + i1 = "results/cellranger/{sample}/unmapped_reads.sam", + i2 = "results/kraken2/{sample}/{sample}.kraken" + output: + "results/count_matrix/{sample}/count_matrix.tsv" + params: + p1 = "results/count_matrix/{sample}/", + p2 = "results/cellranger/{sample}/{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" + log: + "results/count_matrix/{sample}_bam_extract.log" + shell: + "python3 scripts/bam_extract.py -i {input.i1} -k {input.i2} -b {params.p2} -o {params.p1} " \ No newline at end of file diff --git a/rules/kraken2_mapping.smk b/rules/kraken2_mapping.smk new file mode 100644 index 0000000..ef4ecd5 --- /dev/null +++ b/rules/kraken2_mapping.smk @@ -0,0 +1,14 @@ +rule kraken2_mapping: + input: + i1 = "data/{sample}_S1_L001_R2_001.fastq.gz" + output: + o1 = "results/kraken2/{sample}/{sample}.kraken", + o2 = "results/kraken2/{sample}/{sample}.report.txt" + conda: + "envs/kraken2.yaml" + params: + p1 = "/data/repository/kraken2_contaminome/virus_db" + log: + "results/logs/kraken2/{sample}_kraken.log" + shell: + "kraken2 --use-names --threads 4 --db {params.p1} --report {output.o2} {input.i1} > {output.o1} &> {log}" \ No newline at end of file diff --git a/rules/kraken2_mapping_local.smk b/rules/kraken2_mapping_local.smk new file mode 100644 index 0000000..a4fb5f7 --- /dev/null +++ b/rules/kraken2_mapping_local.smk @@ -0,0 +1,13 @@ +rule kraken2_mapping_local: + output: + o1 = 
"results/kraken2/{sample}/{sample}.kraken", + o2 = "results/kraken2/{sample}/{sample}.report.txt" + conda: + "envs/kraken2.yaml" + params: + p1 = "/data/repository/kraken2_contaminome/virus_db", + p2 = "data/{sample}_S1_L001_R2_001.fastq.gz" + log: + "results/logs/kraken2/{sample}_kraken_local.log" + shell: + "kraken2 --use-names --threads 4 --db {params.p1} --report {output.o2} {params.p2} > {output.o1} &> {log}" \ No newline at end of file diff --git a/scripts/bam_extract.py b/scripts/bam_extract.py new file mode 100644 index 0000000..ba6e6ff --- /dev/null +++ b/scripts/bam_extract.py @@ -0,0 +1,58 @@ +import pandas as pd +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input_file", metavar="PATH", help="Path to your SAM file") +parser.add_argument("-k", "--kraken_input_file", metavar="FILE", help="Path of your .kraken file from Kraken output") +parser.add_argument("-b", "--barcode_file", metavar="FILE", help="Filtered Barcodes tsv file generated from Cell Ranger") +parser.add_argument("-o", "--output-file", metavar="PATH", help="Path to your output file") + +args = parser.parse_args() +file = args.input_file +tags_dict = {} + +#TODO: Check if the Kraken file is empty + +#Parsing SAM file and storing ReadName along with its TAGS in dictionary +with open(file) as f: + for line in f: + if line.startswith("@"): #Skipping header + continue + fields = line.strip().split("\t") + read_name = fields[0] + entries = fields[11:] + + for tag in entries: + tag_fields = tag.split(":") + tag_name = tag_fields[0] + tag_value = tag_fields[2] + if tag_name in ["CR", "CB", "UR", "UB"]: + if read_name in tags_dict: + tags_dict[read_name][tag_name] = tag_value #Adding tag to existing existing dictionary for the read name + else: + tags_dict[read_name] = {tag_name: tag_value} #Else creating new dictionary and adding rag to it +#Converting them to Pandas Dataframe +df = pd.DataFrame.from_dict(tags_dict, orient='index') +df.index.name = "Read Name" 
+df.reset_index(inplace=True) +df1 = df.drop_duplicates(subset=['CR','UR'],keep = 'last').reset_index(drop = True) +print(df1.head()) + + +#Reading Kraken Output and merging with the previous Dataframe +columns_name = ['status', 'Read Name', 'Tax_ID', 'length', 'LCA_mapping'] +df2 = pd.read_csv(args.kraken_input_file,sep='\t',names=columns_name, index_col=None) +merged = pd.merge(df1,df2,on='Read Name', how='inner') +merged_subset = merged.loc[:, ['Read Name', 'Tax_ID', 'CR', 'CB','UR', 'UB']] +print(merged_subset.head()) + +#Filtering dataframe based on barcodes reported by CellRanger +barcodes = pd.read_csv(args.barcode_file, compression='gzip',sep='\t', names=['CR']) +barcodes.CR = [x.strip().replace('-1', '') for x in barcodes.CR] +merged_barcodes = pd.merge(merged_subset,barcodes,on='CR', how='inner') +print(merged_barcodes.head()) + + +#Creating a count matrix and writing it to a file +result = merged_barcodes.pivot_table(index='Tax_ID', columns='CR', values='Read Name', aggfunc='count').fillna(0.).astype(int) +result.to_csv(args.output_file + "count_matrix.tsv", sep='\t') From 472631ba0bec91ca0264e435625b804baf882785 Mon Sep 17 00:00:00 2001 From: Saim Date: Wed, 5 Apr 2023 14:16:18 +0200 Subject: [PATCH 05/33] Minor changes --- Snakefile | 58 +++++++----------------------- config.yaml | 3 +- rules/download_samples_or_copy.smk | 22 ++++++++++++ 3 files changed, 36 insertions(+), 47 deletions(-) create mode 100644 rules/download_samples_or_copy.smk diff --git a/Snakefile b/Snakefile index a2f3fdd..1a42b62 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,7 @@ import pandas as pd data_dir = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data" -accession_list = pd.read_table("SRA.tsv") +accession_list = pd.read_table("jiang_analysis.tsv") samples = list(accession_list.Samples.unique()) """ @@ -14,56 +14,22 @@ samples, = glob_wildcards("/data/manke/processing/momin/virome-scan/sc-virome-sc print(samples) """ -#ruleorder: download_samples > cellranger > 
kraken2_mapping - -if config["files"] == "local": - include: "rules/kraken2_mapping_local.smk" - -else: - include: "rules/download_samples.smk" - include: "rules/kraken2_mapping.smk" - -def input_files(): - if config["files"] == "local": - l1 = [ - expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), - expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples)] - return(l1) - else: - l1 = [ - expand("data/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), - expand("data/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), - expand("data/{sample}_test.txt",sample=samples), - expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), - expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples) - ] - return(l1) - +include: "rules/download_samples_or_copy.smk" +include: "rules/kraken2_mapping.smk" include: "rules/cellranger.smk" include: "rules/extract_bam.smk" -include: "rules/extract_tags.smk" - +include: "rules/extract_tags.smk" rule all: input: - input_files(), - expand("results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam",sample=samples), - expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), - expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples) - - - - - - - - - - - - - - + #expand("data/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), + #expand("data/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), + #expand("data/{sample}_test.txt",sample=samples), + #expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), + #expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples), + #expand("results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam",sample=samples), + #expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), + expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples) onsuccess: print("Snakemake finished successfully!") diff --git a/config.yaml b/config.yaml index 3681f39..e481375 100644 --- 
a/config.yaml +++ b/config.yaml @@ -1,3 +1,4 @@ -"files" : "local" +"files" : "proxy" "chemistry" : "SC3Pv2" "transcriptome" : "/data/repository/misc/cellranger_references/cellranger/refdata-gex-GRCh38-2020-A" +"dir" : "/data/manke/processing/momin/virome-scan/sc-virome-scan/data_old/" \ No newline at end of file diff --git a/rules/download_samples_or_copy.smk b/rules/download_samples_or_copy.smk new file mode 100644 index 0000000..d7ae67b --- /dev/null +++ b/rules/download_samples_or_copy.smk @@ -0,0 +1,22 @@ +rule download_samples_or_copy: + output: + o1 = temp("data/{sample}_S1_L001_R1_001.fastq.gz"), + o2 = temp("data/{sample}_S1_L001_R2_001.fastq.gz") + params: + outdir = "data", + data_directory = config["dir"] + threads: 16 + priority: 100 + conda: + "envs/tools.yaml" + shell: + """ + if [ {config[files]} == 'local' ]; then + cp {config[dir]}/*.fastq.gz data/ + + else + parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip --tmpdir /data/manke/processing/momin/virome-scan/sc-virome-scan/tmp + mv data/{wildcards.sample}_1.fastq.gz data/{wildcards.sample}_S1_L001_R1_001.fastq.gz + mv data/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}_S1_L001_R2_001.fastq.gz + fi + """ \ No newline at end of file From 926f86cbd8caf9c6e74209024290595c1c75de8d Mon Sep 17 00:00:00 2001 From: Saim Date: Wed, 5 Apr 2023 14:18:20 +0200 Subject: [PATCH 06/33] Purged rules --- rules/download_samples.smk | 21 --------------------- rules/kraken2_mapping_local.smk | 13 ------------- 2 files changed, 34 deletions(-) delete mode 100644 rules/download_samples.smk delete mode 100644 rules/kraken2_mapping_local.smk diff --git a/rules/download_samples.smk b/rules/download_samples.smk deleted file mode 100644 index 4e77e03..0000000 --- a/rules/download_samples.smk +++ /dev/null @@ -1,21 +0,0 @@ -rule download_samples: - output: - o1 = "data/{sample}_S1_L001_R1_001.fastq.gz", - o2 = "data/{sample}_S1_L001_R2_001.fastq.gz", - o3 = 
temp("data/{sample}_test.txt") - params: - outdir = "data" - threads: 16 - priority: 100 - conda: - "envs/tools.yaml" - shell: - """ - parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip - mv data/{wildcards.sample}_1.fastq.gz data/{wildcards.sample}_S1_L001_R1_001.fastq.gz - mv data/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}_S1_L001_R2_001.fastq.gz - touch {output.o3} - """ - - - \ No newline at end of file diff --git a/rules/kraken2_mapping_local.smk b/rules/kraken2_mapping_local.smk deleted file mode 100644 index a4fb5f7..0000000 --- a/rules/kraken2_mapping_local.smk +++ /dev/null @@ -1,13 +0,0 @@ -rule kraken2_mapping_local: - output: - o1 = "results/kraken2/{sample}/{sample}.kraken", - o2 = "results/kraken2/{sample}/{sample}.report.txt" - conda: - "envs/kraken2.yaml" - params: - p1 = "/data/repository/kraken2_contaminome/virus_db", - p2 = "data/{sample}_S1_L001_R2_001.fastq.gz" - log: - "results/logs/kraken2/{sample}_kraken_local.log" - shell: - "kraken2 --use-names --threads 4 --db {params.p1} --report {output.o2} {params.p2} > {output.o1} &> {log}" \ No newline at end of file From 6cad1c3ee342790b52818db698252149ea2a2fd9 Mon Sep 17 00:00:00 2001 From: Saim Date: Thu, 6 Apr 2023 13:25:34 +0200 Subject: [PATCH 07/33] Added script for Kraken2 Plots --- scripts/kraken_plot.py | 115 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 scripts/kraken_plot.py diff --git a/scripts/kraken_plot.py b/scripts/kraken_plot.py new file mode 100644 index 0000000..09a87bc --- /dev/null +++ b/scripts/kraken_plot.py @@ -0,0 +1,115 @@ +""" +Summarizes and performs plotting of Clustermap for the total reads counts assigned for each family and species by Kraken2. + +This script processes all the Kraken2.report.txt files Kraken2 tool and organizes the number of reads(fragments) +mapped to each family and species level. 
The output of the scripts are two tab-separated files consisting of reads +mapped to Family level and Species level respectively. Depending on the TSV files, a clustermap is plotted to analyze +the distribution of taxons in the all report files of the samples. The Clustermap are saved to PNG file in the directory +specified by the user. + +Usage: + ./kraken_plot.py -i -o + +Outputs: + - familywise_taxonomic_readcounts.tsv + - specieswise_taxonomic_readcounts.tsv + - clustermap_familywise_log10.png + - clustermap_specieswise_log10.png + +Returns: + None. The script saves all the files to a output directory. + +Author: + Saim Momin + +Last Updated: + 06-04-2023 + +""" + +import pandas as pd +import seaborn as sns +import numpy as np +import matplotlib.pyplot as plt +import scipy +import glob +import os +import argparse + + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input_file_directory", metavar="PATH", help="Path to master directory Kraken2 report files") +parser.add_argument("-o", "--output_file_directory", metavar="PATH", help="Path to directory for output files") +args = parser.parse_args() + +path = args.input_file_directory +dirs = [os.path.join(path, d) for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + +kraken_files = [] +family_data = pd.DataFrame(columns=['Taxon']) +species_data = pd.DataFrame(columns=['Taxon']) + +for directory in dirs: + kraken_files.extend(glob.glob(os.path.join(directory, '*.txt'))) + +for file in kraken_files: + df = pd.read_csv(file, sep='\t', names=["Perc", "Reads_covered", "Reads_Assigned", "Order", "Tax_ID", "Taxon"]) + df1 = df.loc[df['Order'] == 'F'].sort_values("Reads_covered", ascending=False) #Fetching Families rows + df2 = df.loc[df['Order'] == 'S'].sort_values("Reads_covered", ascending=False) #Fetching Species rows + + df1 = df1[['Taxon', 'Reads_covered']] + df2 = df2[['Taxon', 'Reads_covered']] + + df1['Taxon'] = df1['Taxon'].str.replace('\s+', '', regex=True) #Removing whitespaces 
+ df2['Taxon'] = df2['Taxon'].str.replace('\s+', '', regex=True) + + filename = os.path.basename(file).split(".")[0] + df1 = df1.rename(columns={'Reads_covered': filename}) #Changing column name to filename + df2 = df2.rename(columns={'Reads_covered': filename}) + + family_data = pd.merge(family_data, df1, on='Taxon', how='outer') #Merging out all columns + species_data = pd.merge(species_data, df2, on='Taxon', how='outer') + +cols = [family_data.columns[0]] + sorted(family_data.columns[1:]) #Sorting the columns +cols2 = [species_data.columns[0]] + sorted(species_data.columns[1:]) + +family_data = family_data[cols] +species_data = species_data[cols2] + +family_data.to_csv(args.output_file_directory + "familywise_taxonomic_readcounts.tsv", sep='\t', index=False) +species_data.to_csv(args.output_file_directory + "specieswise_taxonomic_readcounts.tsv", sep='\t', index=False) + +# --- Plotting Clustermap for Taxon --- +family_map = family_data.set_index("Taxon") +family_map_log10 = family_map.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x) #Log10 transformation +family_map_cleaned = family_map_log10.fillna(0) #Filling missing values +sns.set(rc={"figure.figsize": (80,60)}) +sns.set(font_scale=0.6) +g = sns.clustermap(family_map_cleaned, cmap="coolwarm", xticklabels=True, yticklabels=True) +g.ax_heatmap.yaxis.set_tick_params(labelsize=4) +plt.title("Family-Wise Clustermap") +plt.suptitle("Family-Wise Clustermap", ha="center", va="center", fontsize=14, y=1.0) +plt.ylabel("Read Counts (log10)") +g.savefig(args.output_file_directory + "clustermap_familywise_log10.png", dpi=1200) +plt.show() + + +# --- Plotting Clustermap for Top-10 Species --- +species_data['maximum'] = species_data.max(axis=1,numeric_only=True) #Getting maximum reads +sorted_species_data = species_data.sort_values(by = 'maximum', ascending = False) +top_10_species = sorted_species_data.head(10) +species_map = top_10_species.set_index("Taxon") +species_map_log10 = 
species_map.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x) #Log10 transformation +species_map_cleaned = species_map_log10.fillna(0) #Filling missing values +species_map_data = species_map_cleaned.loc[:, species_map_cleaned.columns != "maximum"] +sns.set(rc={"figure.figsize": (80,60)}) +sns.set(font_scale=0.6) +g = sns.clustermap(species_map_data, cmap="coolwarm", xticklabels=True, yticklabels=True) +g.ax_heatmap.yaxis.set_tick_params(labelsize=4) +plt.title("Species-Wise Clustermap") +plt.suptitle("Species-Wise Clustermap (Top 10 Species)", ha="center", va="center", fontsize=14, y=1.0) +plt.ylabel("Read Counts (log10)") +g.savefig(args.output_file_directory + "clustermap_specieswise_log10.png", dpi=1200) +plt.show() + +print("--- Script Completed Successfully ---") \ No newline at end of file From 793ab5791359799b9c9a6017f4dcc5c503697c52 Mon Sep 17 00:00:00 2001 From: Saim Date: Wed, 12 Apr 2023 17:32:33 +0200 Subject: [PATCH 08/33] Added Synapse ID fetching script --- scripts/synapse_fetch.py | 83 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 scripts/synapse_fetch.py diff --git a/scripts/synapse_fetch.py b/scripts/synapse_fetch.py new file mode 100644 index 0000000..6446806 --- /dev/null +++ b/scripts/synapse_fetch.py @@ -0,0 +1,83 @@ +""" + +This script logs into Synapse portal and walks through a parent folder to get a list of all entities present in it. +It then preprocesses the list into a Pandas DataFrame along with directory information. 
+ +Usage: + ./synapse_fetch.py -i + +Requirement: Synapse Account and Configuration file (.synapseConfig) + +This script requires the following modules to be imported: + - pandas + - synapseutils + - synapseclient + +Output: + - synapse_ids.tsv + A tab seperated file consisting of following columns + - Directory: Directory name for a file stored in Synapse portal for a ParentID + - ParentID: ParentID of the Directory + - Filename: Name of the file + - EntityID: SynapseID for the entities under the parent + +Returns: + None. The script saves all the files to a output directory. + +Author: + Saim Momin + +Last Updated: + 12-04-2023 + +""" + +import pandas as pd +import synapseutils +import synapseclient +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input_synapse_id", metavar="ID", help="Synapse ID for Query") +args = parser.parse_args() + + +with open("/home/momin/.synapseConfig") as f: + file = f.read().strip().split("\n") + for i in file: + if i.startswith("username"): + user = i[11::] + elif i.startswith("authtoken"): + token = str(i[12::]) + +syn = synapseclient.Synapse() +syn.login(email=user, authToken=token) + +#Walking throw the Parent-ID and getting all the entities present in them +file_list = [] +test2 = synapseutils.walk(syn, args.input_synapse_id) +for dirpath,dirname, filename in test2: + for f in filename: + file_info = {'dir': dirpath, 'file': f} + file_list.append(file_info) + +#Preprocessing for the fetched directory and file list from Parent ID +df = pd.DataFrame(file_list) +df1 = df.applymap(lambda x: str(x).replace("'", "").replace("(", "").replace(")", "")) +df1[['Directory', 'ParentID']] = df1['dir'].str.split(',', expand=True) +df1[['Filename', 'EntityID']] = df1['file'].str.split(',', expand=True) +df1.drop(['dir', 'file'],axis=1, inplace=True) +df1.to_csv("synapse_ids.tsv", sep="\t", index=False) + +#TODO: Get the list of Synapse ids only with .fastq.gz command + + + + +#TODO: Work on the downloading 
part and storing it in the directory. Possibly by multithreading approach (Discuss?) + + + + + + From 480f8d921e7617246e7d04a3066dcd11fa453c35 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 17 Apr 2023 12:05:30 +0200 Subject: [PATCH 09/33] Snakemake rules and ReadME updation --- README.md | 22 ++++- Snakefile | 2 +- dag.png | Bin 0 -> 23890 bytes rules/cellranger.smk | 3 +- rules/extract_tags.smk | 8 +- rules/kraken2_mapping.smk | 9 +- scripts/synapse_id.tsv | 196 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 227 insertions(+), 13 deletions(-) create mode 100644 dag.png create mode 100644 scripts/synapse_id.tsv diff --git a/README.md b/README.md index 6b2fdd7..15d8dfb 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,31 @@ -# Snakemake workflow: `Single-Cell Virome Scan` +# sc-Virome-Scan: A Snakemake pipeline for detection of viruses in single-cell datasets. [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Tests/badge.svg?branch=main)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) -A Snakemake workflow for processing Single Cell Virome Scan. +A method wrapped around Snakemake for swift, precise, and accurate detection of viral pathogens in single-cell +RNA (scRNA) datasets to investigate the possible correlation between viral pathogens and neurodegenerative diseases. + +
+ +# DAG for the pipeline +![Graphviz Diagram](dag.png) -## Note -This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. + + +
## Usage -> snakemake --cores 8 --use-conda --configfile config.yaml --latency-wait 60 +> ***snakemake --cores 16 --use-conda --configfile config.yaml --latency-wait 60*** + +### Note +This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. +
+## Contributing The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=maxplanck-ie%2Fsc-virome-scan). If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). diff --git a/Snakefile b/Snakefile index 1a42b62..d5452fe 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,7 @@ import pandas as pd data_dir = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data" -accession_list = pd.read_table("jiang_analysis.tsv") +accession_list = pd.read_table("SRA.tsv") samples = list(accession_list.Samples.unique()) """ diff --git a/dag.png b/dag.png new file mode 100644 index 0000000000000000000000000000000000000000..276925b149664d477af910bd00c25d861bcc5211 GIT binary patch literal 23890 zcmce;byQW|8!ozN=`KNG(}FY-g0zIBq9CDkhjdCvcXyYBbc1w*w6t_cDk&ZN&iy;* z-ZSpFzccRr>yGc+%3!TM*PL_B`OfEg-sd$$Raq7n>me2dLAdgA(r+LL2@`^lsUDz% z-&|%|nuEX44HRUhAvof{?AF3K2%?4Lr6u0FrvF)T`~3E-RqTE?#dF9gtJKICQ;%#M zi}igroo@`kdb?cML+d&(%O|4a+7&0`H?1o_KXRhkU7dLy8|8F|UTM1)V>0?))1^XyaVE*9} zOwLID1m=0swEn0sVG<88UvW?4&ZU=`Rb`56z%~B~aTttSe;JyjiTw3|FkZ4C;MHTl z_fGsob!c+DI(V%-@BUOv_bb(JR4;j9B`kT%6y80B_HGy^34<7xJ`XFg_{nRqICJNw z*vieN|Gg~OAgz5Lg<%sM*PX5Cr=0R2K#^*O4wKM#XcE7<23y?}o5BHOG0EhTj6yL$33Z{++LM=E~Kh&qz4 zg68Ox*GAv5=c`J3*j4plfvDY0m+_=w%<@6OBM(jw!*=wYdzdZ zi%eRje2z01tQpGovwk>utfLWjtkf^6v*U&hWykN^8UIpe*eAn!Lk)ZLbJ`m(OF#@W zXaNUGexw^Z6e|#-^KKLE5%cSn3m6RnYZKD!=(P*h10F+03+zg(nGMxJT*wFe#iyLM zL#ddO&=zi~t}QFsmp9nU7aeJz!{21J`f{`g_ImzoJLgakOvFw2PA8)gouDlXUjm9$ zR>Y5k_Fyd)hKL84`+988Z^n?JH~)I&rpO+AB!NxPO7LPlxu%1}3-55OMtA20xak$%L?pXH%i8fXXcppHj15>`_QgVDJb&Ve~kz&|F!e1+Yx#swtd}}^7Gox-x9W**RD*wr_I+5c; z=ysO3sKEmHX`Rx${FStnp$pBF%s9>o>LmZ@Y!-$GFp1Z41d_g33Ualzwb`|`Brf-FCi6Gn8#r&`Qs?&RiP-+YZk;^c@x3CVdNVFG4zHrrgd6X#zSOArl#~9 z9C%;9el4n~z%Rbn>+={>c=f8%dV#<6eHAh{H@EBGx>;YFF)Mz+-QHM%rKmL 
zW@%}}h=;PIqQW$syWVm44ZrCq_4%U17yDw_G|xnuCz*;y5HrmHVygv`wHvgn?Nn!0-5O!XKdM=vXoVAWJXMK)^UybKTg>ML&)M;HUcwKhWRD0jedgHo|(npC9RW4nf zZl8YHhcEN-@o_<=#l<1sr3-&eWb$G_Gm4E~S*pvY;I-C!-ke9*Yv}5ZL6zUWeUm?A zoAa=T-(A!wrSXF$Kt(~J7ZMWkCqiKP!POZL|DBYihcn8q?DYqz>>hgYZQ!p&-0RUGFqL1UT)_taamy`A=K)nr3+h?w2=n z=w;pC{IqD@F1opq<#<5yqYeMD2x9wm>v8#P2wKfoqk-qZTI==f9UP8>#BPz`C!BC! zw(>?~mlaQew)<<#`+L2cGkN7UzHFtu3Gu>7iHcc_MgCi-?ccv6F5NDjk+SRS$)M<% z7^KLJ4m;{id!bGRaZiz!bu4;8QL$*lD# z?_2us-@n@)txB+vU@baMD#51t)f9yoz}DBVnIB(X9$nemSAl#qYfL(S+pn$nDoCkw z>NIw&V7N-HgOT>!9JWVImuS#Gdq%puyX(4FTo|3491Pxy%D4LC^QD=Yzz8aV_2YgT zSp@}bk;nMXhsz@u-r*O&`Wz;e1lBTKmODZ4jC@z3zV^$qB4Eje#-&|t?dIraHZ4WS zUZ(2LA1e^W9G#tg-L&IxlMbbL)bBd|Pj1kmNNj6rWo2c&MH_){B_+(Q6?%t*zG&E$ zMuS8xm+N73HKzDKhK6$9vPylkE=abQ+uYvXNVP2Q9P>Kob#!#hE+{~c5;=L`?fvoN z@RW@c!P*RpfC`U!UaHH)}f+d2_8EE&~CA?q0xw1P+b4dTXkmlap26aIcoT^97_J_It#$NJvQZz}#}B zD9*LG@giQ4dWqkRSmv?U3US@16;Tp0GJ}!SX9Gh+-66!xmG-}XL|;5x_IRVNj-H|z zn82oS62!I{OQ^O7|9Zj?iOUSXyHx*0_Bxa!x91J*0qP~a7wvG*s-YML ztPpvys5Zv1aNw+*oaD_d!I&5OWH+8uyN;J5pS=89Jucn^{iYx>ASWoZx3}+Ls;Bkn z@jRQ>AY74c)OE>U=|5hMn^4Y(6Q|cAYAbFGSn~k`39?rX zXu_oD%uVBv!Nn}2Lvq+?$L_c>`8<6uQn2s1=C4TIo%9zz0zXYpJ9@DbFHbj1B4mg+ zl$oK5)yhK(javx?@@8P5vV{^n5uS8|MiLDn2wbQke)Y1PrMeFXQ>d`(XL!)%Mk&CM z+21c7bz;#=hko-|kubi{gu?L%YZ{_XDN=jr;S zK&v>)piaNZjEeQQH(?Nj$D@;Z&f8NYGkX1ODLh%lfDd;|<;Zx!;fRYFqLptT=q;du z$s{~JM#dyeGp?l6ZGhutoKHd!WnoWle3o9KUCKZLCVhT1Jo(nVZ^UP;?P{$NCf; zx0UB*4{LxxPV|v^U4}RS{FTCapNy>~nO@}u_jlz-Bpd5QDzF8QQO99nLbe{%jb`^0 zXPz{ z|GW{a3Y(TbWyIv)*%{iRNzMG_Ve|LzTwGiPw%%4&Ps3&MXwPC&0wg7U1tX=Dltz*- z8tUpI8m*7ldt;u44C^^I!XyCF9k-~1iG3`Yrek7a@+FN)373@2Rb*0=eylJc{aQ{A z3^fJ|3p3>Bt|J!(&W7ly9v3*&F;xjuidh&Kp2m|Q*U`!4gB#z!diA3*DkqvYMg|7c zXHCGWs;V+vrXeTCsMBKCduha!8y(%K$Bt(4k^rVECzn2GgxqgOZ?i3kCoeBg_g32E zA$W=!3yGYZ9EF{Sq$G0utFI40e*z`xu)>|+O9VZXW+5SZP0P(qXy23$*JO%=adW@! 
z2^~=E;_geHd#RYPWY|AjUvX6T*HXuEDzAGQsd)h}u*loIW#3(czTKN2%GUBgqUmYn zw12tQ{l!nYT&kLma(<7RWH(H;l=F+Sj9*Va?Ki31q3HlIr`9hjBeJV{cvV9oMbxr* zGmaH#P)pj8SdrRnh^BH+vCy+}{*QiX#O+biXmzPQBndF?0rX?%3++qo3td@%sTR7& z=HK{w1`MHa5jKJ{!M?TXR7oGCLQ`)Pd=Y~r5*Zw_1~hg*_8q%3)Wy85srOk&9~P5U zu}BMvirZ4y_at-8D+z_uoO4I!f2<^ad)4b=P|@9hqtTV04n%{5gapW^$zeNGM}D(b zc)@P*TIhb_0!a&l^Da^?zSxyz17=jj^Rm>&B=lRfps`p$VOo~&(niKtdCu958_b5) z>&s`+BR-;Y%gZ03<(S0DM=0IQTiyoV{i4ZBCXkOzLGM=W$ou?4OlN@s(*f(pbj=n zEM)w3t#%`)A#h3lVHZ@Kj%!a6%D3(w%4 z>-Er4J*`b}v&81N;P>EfB+EcAIGASm^fg?mq>Aee<8Jij@X3?Wvn({|={S22)8r#M z4e1`|=3TsduDt4YqAX2O(x%*9m1@}9k>HUES1odOU{;Oz_wvmHrioX zf3L#%Pf#~+E5^?drRuEjQ6n@t3)8qzBg}^V=eu3dWX(CNn6Y?eQQ`~(}w@D+{8W3NY1h8yXtlk3iOo!Y2a(+ zq1hO5nI^(8++~S(PLa>SMd@<0>sTYX^nJ1Z3$4kW=R9O9E9fZQP4^CT8oZwJJPZ%1 z$*&Gi`L5{HCG($ZWb}Kl?3fW$#qKJ_!X~!A*XBCM`fk}VWCwIVB!v1= zW!PV7Im=;WVnWByPX_wTDUzvhsuy{o1(buMoxCW6tCQbc4j>4 zdE6-}DZK|QKU{{M?CE|_0=rY1yV6leF~C1R``{)b_6{BohTMQG~c4`Q5uVnqvP<4_O6&P%bwiLp4~^WIbG3Usd&c8`Kf2@K8o`2eyz_Pv1Gtmi-i3ce}z9L z(wvk?MdTP23mY4;RR8_^7v8wy?Q~FI&u_KZNEjt{FQ}!ZwR-GY>2YDxs8ep&kdl#s z3%JMVs4~|l%W(Mj-GVg9@(Z#14Y8wrUHC%;`v+bt zwW~ZX4x;N*5)uN=_Gj~2TByMRRk`q2(Cb$4$B!SYw*LG`KkBU)Dow{8f4)8Lf8WSe zP5eFZTI=G^ydNN*56HsMgk27#4GrnR?UJA)HKt>I2#I1OO&~8X4+V6`bu&h`;NL^} zO~sOEs3mOoV5ynBM7^x5ZuX zG=jNyZ?R>so4-EYsmBLd!6+Db*$oZkS65e!$9+_G`(+Kt&|6lmuL>3nLSOHB*;O`* ze)B>dB@Vj8O{pn4AnpKNjey_!7hmyB;T{M$!r*8BgM$Oi7l#-L4M>F~ zId7)qALM~y$JGubpl;}R-<=f|7k2{HB7DK7S?N#p087ErnoMm8AbR*>f!+y8Nnsy8 z&gr-;ISINR(N@>gyat3DC=>d#=^n%7j@>zKnj~P?F)oF5n!@#KIK=C9NCxqIG1AcdPe~LJ9N`D@HqSwx#{|K8*XOPMj z*Y4nKv0rr8*|Z3L>uGaG?c;(}h;033U|LZ(76-FVF~YzT6Uug9MYg~&(-FgbE3E35 z`o#hg5Ah;YN~V?o_dxF-xx9ei-7BYtC!(If(;7K!>Vgv#y<>4hBtO7H|MfyXyyv`v zCPJ+CFQr){wM4rqZz!P-$t6jWJ*=1IX6|uTa!^OX=E4=@mhPTwjIMF|u+CDsP0Qjj z5hOl3Xjiz)5yO^x&+v#S<(pz@h&Bqe<8-zzpd`#C9#%R09wig$w!j?2{X;9{^UTgN zx;XXnGntVAxE-?9{)t~H-T*8KJ6;WY2=tlvCOlrGyf7xd$)UY1*# z1cw5e#&1e>U$ZMV{(KG#3Rde`8{M=A_>0xvm?0J*uJx~x~bIpH7I|FE=^0uidl);|X70ccI{y+OTP%r`ff^MMN9C 
zM)Bj3TTHoUddfPj<;8q_^X75=$(&$olRR_b8qYuWY1AK$I*8{Rf3blbD7F#TOlfY2 zB}+?B1^SOYxq0$={N{06wI>?0L+FRm_}_S&205(+W#7osr!&7O*-*jdy9ZLJ9niNV z|GwU^^!yi46swji-Tq*Gcolu$s8Qye{%Ef4_ApUoxMJc zllfuXAJPTwkkv~_UwtLg(DgjauLszqgVQ-KDJcl(i2*gzJl(2enf{GSHeRHTd3Gm; z5Ri*tNghEp0?(d>@R%~Chyq5`W5s_;vs3Kvufrh=&Q7Rf%@*}(q3nOSRPemjR(n)L zM8?m05O1&QRHz<4EU0DYsR8s!4oGCsVOJkuWNN<>W%48pa$U+*!+MDb#Qhaf1N{Bp zKP7|26%}!es*B+kw;H#PEtTF%_DR8Dg#2l^97)(6Q`hvjy?uQr?6Dy7*BGsX z4t2Q{A^CGMK(zGqC{PE5lo~O+ia1lGq$FL}y!q~!5syUKIi8}0x%rwlo4m6eDG3lsCGj=}NyBa+a8krwGPOdd)SB07iX8Mm~Ba&>f{fp&N434~BEpVE%r zM()|s1Ste}g!iAzFdyo?lza(;@#IK#8Sxx%aHXcDtvM@r%&^jlVqwN$PMGgTfxd*~ zf^SqX=FIM2A*sY9#>8~$8vmM`lk+P)YO~8_yK}bN0g$CKulAp^o-j9vrCtQ z|Lk@`hnV3lxama69orROz(0R-HyVt&L%8urJ4yn9H@HN^%K#xdg7`}F5+0;l{8p7U z!S_%MBuVfK&(EE~MH$dbI$gQoiO{2C8lCCI#iv@2q`$K;NdH_}8N)XMO_;Y9mNb9$tftHtY+L6l$9yfY6bJXmJ99?19OxSoDfc}^D&bk?91wF)4`+6^TswZ+Fk! zff|}uScuWu+Inj?zO`jiTUXb)aV%ak9je1f2wwLCpznIWPIxPcd{ocm?(Pmjg~}OS ztE+k)cglro;nl$dJGN4&Z}y^&?3$M-C@5An=x81j7%s^ah*`)7cQo{&)83~9acbKC zpI!jmpzbdP1qIRS7rBW&!_qJqGCZ2#RIt6Z^$`%6vE4*5u|iTp@C2k+u`v@~G47>A zY;GEPrQWHk$_*=mu^k2Y&zWKiMuKeHDIpCF&CiV|3?@uim|vQ7IPSX(Cd{PE`-Gp~ z0K9^2&gg(>OHxwr&2H4uHg@Q-Nxyyb5`Zqq?_TEng5S3VxI(-BsH+(l)kQ^Si#6<1 zsmX3E45(W)L>6^gJ+EZmuqLpuvSPVYNW-{KRplNGd!I8u`6rK4s9y5=?OVPrMNJYS zUhv8~KEpcgsKw^Prp)TJNXW^fzPoY<^bCXLz_qd#`0DF=khnt3#|l|}{k32_YQ%RU z8cC8X7zvA|aQl2}GMqop8%Q}dphTEpTxUEMd=aPp)iRNl%yPq>Ht6CCgZ0Uld05(aZSx9>>B=xfYmndP9*Jro;o;-DMV?%(>Z7m}RIP2*vB+L!kW z0yy7}C3LCFqyC~{pyM*P3US?hc>J$f#H>Ncu4ORPQ`be$s~XSvlvp*H?x2o)5-Gsn z=e8mj$5amVsarwUDSP^%YeIoKOe*ko!21Z~SMzW2tPN}@@?;WANK@Q}68o?k81z}L zKRkAv&oHYJTr4_m9wWs0tNF zS=-llKPp8Qf{it~lQXh=%z{X|L)*2@716j2@&!Men@dWgFA-KoLfF6`S;dGIn+MJ^Dp#A1f%cKZ;!d z)+0z=0HFXdLm*Mb{ZcMm3h@@6^5JN9xyOk67=ls$O6p_FzrGK_A>6BXd*DYWj76zO z4GO%5mB!`|(RxwGpQs{TUC1k*+vctO?u=APz4A$pE9&IbFn%o_?z`c$Z`!|B^~toW z=)Ar>l3%M5{x)bJ;Uzg1CX!N}N>{g!X=MGXLDmJIg?=brl(&a^4<2Vhm@fnSV_j}+ z^n?pf_x;cKqNRU{o4)Nxz+lg)d`GLyl9=o!navvL>vf~81xwhy&rIN+|&`&x>%V(UgVE`u-X(z~ZM 
z`zmLj5tj`dw?*M}gydW#g@cog*5=|Xb{-Q?#C>+!aULsqvZLzce0U{`{IrxPU+$ic zDO@lra~|v%p804q9{ets1Q$`*RK(Fw))w!36b>yWS_%+`6!{r$E82+na`V3` zB&}a zjbg0mf1`YSvasm-?zFE5xRPEcvdGP!npHu(jEoG}w1}1xg9zvzouk|$;5~wFCk%l9 zSWnH&tb-I>e~wfT1i38QVSvQ6?d6faPKzrc8X6i48(Z4AK}S3M?(i5vGQn|zNMRk# zXn#7c`2ycn@$ORKZ1qQwailS~tKUh-J;W;6b#zHA(OV6Tjk1R2 zt~SpbnpBi)vC`dz`az)LW<)*UntJ}FB!_*ZUO&ywc@XUA%AE3!<%&ro>+bC>mYSNX$3=m>bvryXlxB+<{2JKV1Y0hjAMgIY1G&3Z z=jPWRKD=0O`(LSMZn7gMkev=_!H3U||JM|@?l^yDcsS1cGW9k%5(K7_Nv9@0Nh#a} zEM!6+hW~A{7F$zO^F1(|fVAgpO-;<^rO1u`sJznBr+U4oAv=eMLH!#NAl2}el{E74 zi_oQJm!R8Q;5FFzR#M)zHIgsb#6yUH59ppbiv`ulnW%wTW|K@rWfmS8GoEy_sIX zVQNxV`)GcA&$MFcxq!h!E{)5M6enza9&NuY#gt{_9wxm+Y8dE+q|ekzf}d zEkW*4T|WexD$^x}K_&?iHEMq-w9_rnj%1T4w>c@^t{CzVwF)Bw4Z|*nlIE;cPVq{2 z@!+2iPoHS2$S!q)XbfUWOs>WWe0OxRbX0%R#v#|GLzs?FII&L)6(A2gY7CN8N66<_ zCX(Y^(KABe(fg=n+1;Ru|DW(yU@v1hI1aI*uimA$alPYnaHBZt9KR`2A5fKq5t0B* z!^GQh_CpcH;Mp=^>Y@5s_M~!INbzNM0ePAFC-0u>Qi`3T4IUPE7u0o2c||6Mcpkil zr*f!};3CdV+X3}9rP0%R2?4y!17xUZllsRNbFfa#KST&*g&{#JgJDw2=?#8=&GOJ> zIkdXe7Zf|8+lvtA_teX3gDY%u4+K4=Wn^Kn=;A>CaFie9DXZG5pU23{iuN8p&~W4J z`ts?yUFif>c`-WUtL@ipbR=LgA0IZRTd4wMR8=epvgQD0+}2{Awen?2vMd+x_3Dng zSFlV!F?QWa#VIm-SsM-JkbxR>W`C>4(8-G$%Kp?5fjHthQ_D6)EVoX$R;Sf6T#rkf zp=Gl9vBZ4XWD!wQHR7<1whcgq;*%H*H4_PaTpkTEi&^)T6h1ds8q!Z?_q&|#kGNbA z`h+d{WaDU(O|42WLfK6jHA0qDPEbmcW5X66`!>Iy#qP+FTmd9W;r|}K)%sL z9cQyTWg(tlc=2+-$p#$fN$tUGGVJ|+8ijeZ@@)1_6G0{gL~`%mkph~71OcY{EdIyQ zzi%{_wd(<#3s+13CnT|_GkYPdC zENf*I7v@k+L~J33tC_v(qnt!E zjW-fs|2597a!OC;w+~r_UtIeRE9XbQgeDoCG;4n?i6t8~z>EmsXl_i^>C4i||6W8+6zUHM-?}}NK zyIKP;FVU5p1z@o%N$B&I1NcS!R*LQ?J7&&hkiT+a+R1tRE5>$o7DJqO@7}2`Go?HS zTl~QUGpyD&2o=gNC_o(WkRE^9#Lu5ZiGO1U^P&j`9Bt%nsrxI49^*M|4X;Vn0r7n1 zhJ+HdU0|rtWm1C#@jX=P*iU0EQx5|%7@}}+a-^e1#EynU|Nf264xfuRXSR-7cuBw( zNYjOI4V9Fz{kfsd$w9PkDGyltsb5#uG|gJnhTw39hdJR2IONO;%O`>XXqqfzBn(j= zS_6j7n8d9|9&5IOsz8*8xr<&!K&2(%47qvZXSCY5iC9yjAdLZZL~R5c`IdA^IC4BLC~ z1tB^ZpkU+%zEJK91`5NS!wU`}ia~Y%c7$n7|E;#Rf*~_1V2QN4mUO?Q41ln3s6oie 
z!1oXr03sw;{8^BtK%3_TLySLM|7wPhne&`IXlQuYzf%u_99>-UYU0SA5FrEXSfHRl zx2WMA_(QaQ8asWb;EIU{JG&?ngS~LID6_*zi(f*LRs;=CFrO>08q2I|bJow?f z5gEZx5okUr8o`V){6c`*79Ah&pP$bFEk{lPYvF`N$^n>OzDJUhrOHDSzUHq1Hb8{0qIzJtEbcq3Bh!!3lJhLy5PXgKnmb$!& zW)2G%7lB_33qbq429pvE|LHZc9IMJgqW>YliuJZ%82~KVzd7k||3~BK7#J^0jZS(* zYq%e_evASr0b0JOD=3iJ3@tzqkt_IiL|my<1ZbIY`{ung6-4us#?P0ATg? z^*%Qk7#Q4KT(tA|q{RRQbmTgTC%iw^QVswk;bf*!(ACWnpO)rCjL#n z4LR`DF?eVk%ZIJO@mG_&;ik@IKH7 zKsjh?vV8G``hcs%*9;3_&{2WjJRt_3)jN_YHzEFqij2Ny*qAgDV^&<~H}f@>o>z?2 z)6=$m{9ThP_+$k+f2z>XxO6?(lXdW*op0qbzuVy+@9Wu_l#Bfp3;mwcqwL*&-XUlA zwYmSShlPR|CxMbDk!04q9tXMmiYIyNVGD|>tZdJDp0Eqvj{}>3rJ5)Wosf$nj)?mU zqu0vb_;nBoTqgPYgTd&)FXY>2Q$)Z~K2nc^gh-*)XfzVb4E z4wLFS|DCksy32@zm&PDdFfEx7gY?qssbIUMZWj!s71^7bDyeXy*>xk6S+b1NgfeVX zMkuH)+>m@~@=zXCwD`7NtjL~ZxJb5}P59T#!(c=aGAMW{}E6P{_Yv0diSl{JNBML!2(f5xE0S{&E>6V1>r^iI_S-9jZ4Lo@H zVi`r`@q5ub62Fm=fy-XbY~cLyRzr?gkXaL(2zjebjDmu48}KGZaLU%%d40X!`m;ET z5|h#0o`zg2+^ZA0E2~qF;B%H_G!+h4LK3~@jWf6m`Q(-3k<)nKfO-eJEEUjacyzQy zyEMZMy~y32*>V+1c{vBFza|{DRVbx9vxBZMQ^F_A3?HUnuzgcco~QRE2NpJonX$g3uG@!L=8MZNghX z!jvXb3Zi=Tl}GPaYq4$G$d3C4sE|?2lj!TlmwJqrl%*m7-r_F)3)K1X6@4&&4=yOA zCc)R}l7_DEGpD5J(oOlF<5T;X76rtrs-qjfdNo(C#3Yjv>+4_51NrouNNs;X$rAqD}UWfT-gMNw9AFW2j@%CRyy>wZ{e01(V4Ve+)lIRR+IUGiI(qiIv1K{e6meL5e7xRDo2Nk% z;Q-u)=Z_*)fd9a)lq74RM4nv+E|CxjaicV{xrg+Y_OM|IHDTHht zXR!U1lfj5bv%_4AhZh7)#UC(OfD|qe+JO*2D=bW9XJ-f4<4Yi8K-o=Aluw>KLF&H{ zyPk*wuHsi9?X>8;U@d63j*75p0Qmr@OUOs(nEx*t5K@r0Ne!iTy_JNG4afS%#&KNM zEh>MS^{{97HUJy%+dc~+tSOuy!|7!sk1irK6y)VG>ur`oe-9-A)4yJ)FB+nX2h>9Z z3=b;;>-hSz$04qp@tGYr9V-aNcr>EJh!3!i(>e8Y_sv-aA3e!vL?d^T~>{-GiA$$M-m|}D|$c2GCl?I^J0!Y()-~4aNNQAaT zeV>5BmgD2&KcGD0KMx1`)36y|_4)aEv<2eY|F@!-ARV>J75q8|X^>`MaC-Rm6_q`jygP4Yl;v&LdxUW}KYrUk)vA4vCh(~bxW5IU4p4G&+|Yi9%h_^-jPMOLHVQ5RlR8)* z8UHMStNy0apcf0+mPBrkx(5aZI@nb7 zW2{__y4c#AQQQLT7YvG2?i*J`NBImSqkpDE{Q9K@-03ptf{`fwcF{w>xza`e1m2(l zWgQneZEcUk$k@Apt|9r$SzTSdV+;fTaSt#~0RKFw;i$Ko_XSWE694ew5vb=l=@xv$ z_i2vL6TuVieW%ys{+H(kS@J{Pg#X_xJdj7c6b-*89t(Z>3KsLf94mduZ8qO`K^O;w 
z29#Tb0J-z+yLW2{r+Wu~Uur=5DHZvD6}fPehR~pa!ND&Je`TDU&WaXNhZVQmpnqVl zB}`1V8ta0;08{B_8kxT2omtyU$u95gr1wDis`R>Zk55X%17_shQt z*~<4%=b9XtA5u}hR#)e@VK+eJPvP)60~-6wYld1a>ke=2s+QZ6Ar=;v4&dw3=Sit_ zJ2e5Flqyr-01FuLZM?@K4**^#Wm8`zZ@)ydX+2ZFce-6CZ|{KLj=@3tF4Aoa_Fqjg zNKx|*pJ)(vB9oDao4BXtp@wraFf$u=j{ojn(-U$hVOM#92FZD>BK7!R z>eF>SIKRnB^hX7mN`Q~APLB;PWC?Z=V-Aq)9Gjm25sqmfUxfdq|ukH-!ax(%}2dND6n zA^}B{2o~UlxdKW=tmia@lCCn8oo)gMW6&E`Lieim3)`N6Cd3$eC^APVHZG__q;TO+ z_R~j3^jAbvQYG3~LVkYYV+IF5<-L@JHQjCDS=K)$I#^6>7d&VWn8*GsFst#ZYGQUQ zgsP@0B3*#Yy+1%)l52lu$>wd34RY5+PE-L}RJxF&0%W-ga&fQ*eL4r=^-cyvf*dvZz2I*lhOkKH4nsa{y z&NxoghM?g!@xm#S@XRmIb=}accH%vLmNi6v7i9ycf=f6CIU57ri(u9R;gv7XnK67W z+>ydoYOST;C(_I62|43H*`tFJe<&OPfshM+&yERqdrBwz^9_?@4CZ1xgUK<(O-}tt=SG z)W8_CT~r}EIht|X)iIdHFQKiDW9=IyUabZkgmtlXoYiorwjBd7?0+mW7wpN_bD(dK<5 zwZt~fJ*1%$bs-ZewO-ahS5^8n-$@fOBt5R9740&0hN};!1W$4~`nv7pTCPlrczMR} zyRdD~&by{3{bu0ToQ`eRtT+lulDcnE->H+4R8{3==F%r|6%vCP5ajsiUzUqJlP3!x z_wPfEp59K!*$-UdJZ2VvVG-rc?gJJ{VOV+nA%*-2UXWi_$Gc%61{{S%ag8^=nF1@) z7rJn)Eyu3ug|KCpWr;)KLq)G{d!gih6r0qMW3JcfI+1hJB{8-7-N7Xy{QxR)^Q7Q_ zO~{|71ndq!*E;|v3u#zH-l&_+y=cAAyV{KNf%_8o3gWd&ZQpOekE1Asq5|kR@VJK( zVF%ayvO0~VC;VMH#F!|4olx2cIv@^(leTVBo)AQRi1>E_C?~v-pNz_U1fXC*1Y+c3o4tRr;w9_pt+6p=%3>Ro!`EGQDcQo?&tg%@L>|8 zNnKowzJBBjtR}ReAQh7YUxiRu=Z2u!$w@Y4{jYZ{Kq%5TH%D6i8=cYY zT=wsyZhW>{HW6RTDNpQY7%T}<*+u`IGY9A+dB!ZXBv@!5Mn0h}00NP(KMv-K+0gC@ z>|0sc@7a;~q2G;vs)q=OiRta%O+ne_X>`MiP`4kKQPiYOAXH zZ0gs%h${w@!2*-)nvIPfH1gG%KYWC805NGtCG??l&UQ)(-I?YY^cf4a(Czwi)J$nl84%2ACu z5QFG=dDS+*6#yXv^YoFut4L<0iX?GaMj{~ov-z^?Ibg;Z=u3X} z0gD9K7rl0{rQ>|D4wLlA=#`RT>r|LPN3I@@Dk7 zvMVd24?7sU@}h&0+obSHrhnkmrDDOO$CYijHswxSMJ~6}K3#H6?VkZRO6VNxHUl3x z7dyy_)>2JY$cY|C$nd~(C=CTYuld}AO!{Xm>cD+Oj#acg&#^*S2u8gy`zX8 zw)p@}41Kk=v8<>lxVLy1EiWIk ziQziJ0G7{0e>0VN^(7PKXag1K0?jBVB_msc`nHu07Q8wV)h zkSj`Vb~XfRs(oj~nSvzgLU9h0^uj^eE6C3h7OG@zMV%rYTKzw>Twx%wNyv3} z2Z)Z@-VKYS-+ErKx0ocTHV3EKu9KZXh6|ViM{c~^-IJqkg z>ZF2iKu#FBpe>8n6i>;M#u%L2R<^bTzy%Gg^#FMznLd-DVgvE$>hKPwch@nuqJnd^ 
zidi|`M~5RwxzO)p1^81+My4;Hqu<8N43Po_T!@E<=W@Zr-Q9L;m{OGXE<+q$;3)RLe%yx@QUz8$uoHs+5o>p*RFk-T(xB>y-1tMIEmAl)5IwuB_&5IT zPTmq6rKhLMTW1AFfq(%lSBp&!$I}|R0k_m}>|7@IP+nkMmyEhFx3a>yZ2JEFA=r>+ zpFVBl2LOK~0QE15{}La+?*Hl!e2L0MZU-x|fLXxCNMmASd5R|@fxfe|vjeKV92|JX z&O8D80Et}Y7Hh;}AWBL~B0pNA348VRA-9ATOBrV522*Q?lA)pD_-_oS=a0%FxN#AC zU5BF|yoWwegD*k!N)A+=DDHhzNE!+N3UD?;0Tu*7Z*G>Gu`3sBE%1uv0I|GI9`yH@ z%t`+DJQW~Wq(a^PpEENu5`@257;<|+Yrc`z6xL*g$JdmAATT|9kq%_lem(zOge7P+ z9dqb5UD_3_Mo>i!6A*A2zLB+*zvLtq{Izk{&!;Hu;c?@?RSP1oYRNRXD^&T5Ow7N3 zjTKGAg7~ix>%wLH=A| zFUX@5f6eTU$i3BT3O1Fqv$u!!KIStYbwA?Y-q85y#<(?_p@2mA8}Wr&Ep|jAA1plE zoA)$56W5HOMxVJg)1*?}pOYGdV)M)&=Kf%_gSwBLygd3??~Z8luP4c@AoE>$UY=zwD9kGECRFbp03fzx0!4zH68I)y_kj!M%bah(}E& zZE@!T9JW zPO4PA*^y&GqQ-=d@~22zFrAYkMsA7obG02&xapkmC^EtSQOuRNL)HH8LyRTcASQ&- zB*`8cyKF<22$Af{5@KxGvW|U?>|~7)g=F8iUS(ghWJ#2L$eQ){%=>-+ho9?mU1!X> z<~im(=XvhW{kiX3&@8F{!7^Ks#W!PEoz1!|@nzP4uU280!eHwU9>YXGlDo|1HyWCQ z@7yxH7<}r=Gb4kP$@a>0mI6HNa&Xg6DsSF7-2Ig3+7ni~lonWK^cmxuNF#cr%hc2m zcJ-2BMjYAqeyj_R)o{)zw?ZM^r{ax3Huu=S+Sm)9^-K^uSk3;)4-M3xb+SVeTs5AW zZS-sG)+gpBK8%0o5SpGb@zwC_Puv%a?wmTTQk41=Rb`igU`8ZTH}W~#?ATaHY%C4%l}Gbspy6bMiB61y+hLYtYHe2_>_8Igmi?{5w)ukv4X(&<@h z*E6PZT#$yZQ9c&WxKQ|swY;i|K_y?u#ooT2tZZ5G7_&RxNG61rzv=EkBT{}w%4n3% zX~4^a>TX9E$@;S-7v@8d7B>d`3*lc(G_xEn#0PbQlLxYcMt*Gw>ew;}uT7uRyb(Cu z<5fa1(pxQ~t7&EQoq*l%uFP=qvk(EV+|T-&W}9~|rpL)fob^{hP?9ji!@0AT_#1YU zauq}2UvzfIUU=T%)PsFJ+HcX7S4uU$6#UhmJ)~cc972fR&RaUHI6@s3QE00Trrk;q|UN1)*Zr@c=pl~Dk%>$iB z*B*_}qp=^K+L)NMu|~R^$5%R6q`EUvEfBke-Nea%9;=gF*>pSd+>hJ*NYRrK-}uk; z%wNBf`(rDOr_C*HMJ-;(UfCBkF}I1xeLFvARx`oApp!~jSW#u{539_SqOS=t7v}#U zpAzsbb)LC??*qE9q zK!rKDcC;7@*!;BfqP}ay)>4(EZX@tg-QF(^{T@qg|BCVX&$>S__kPb`pS!jsgbMQA zZ1p8Ss?|%nncM8vZB}pXGxKd=L+xrD625oKuWSesL4Hm(kx9`eR1xv> zbWEoERD!3bv^2NwUr-DRe=LmB6LtNi;V|1M@h(=FMNH?hg(}@kzR*q~fI4!4MF;S4 zsu~&~Z{o6mNj(YYhn#2L-jcFFr#84h0I2xQ2E6Q{uPmQQJtM)-G;&XlS}lQK34~H* zUr3tzpSlwghW=wd!H1yKf`FzTXpMl%Y|x3w6iAu={BZ&7M;kB zbiYfmLvhK-m=H95>~dD!R26n3g?`6;0KEZlYMhIKXzSs-|`aI45R( 
zcQgqhCm>S6lophaQC&em9S9_jG@dGoKSDTehb{vG5()r1IMoCawS=6pr+{8M=5GTo z&Ve7pZ`64N1iHbBN>+`hVBp?8_M86X2%4UOpddo2f#?!i{u%?2Q3bRVAh&~eCu%0( z`!>Lw<|1kF34hd!icU^W1~(cTrF{Ya>TD7LI!|dyK%vMnKR6w(oQwTczK0Ts|^6F%b?zO9;s+AayEi4v$e* zCT%no3}{`Qm(8YS1~=(P^E3CQE&CddL0?JJ(d?N!v#Rm`^FimW)mc9@5YX&@L{xt zGuv!NAZUd$OY1stz4GJ8MUDNAR*FtzcYxwA0$?k^Lo3?+A>Iee823OoX=L5;KO-y0{Vk1>g5zueoyH7ri z>W&8$!eCOWeanM#V~NF60mBavn52pzB?!zPA^$U9LJ&>wC9RT$v`$0EmaW_%f4 zmcs%zkfxD>JvUSRI{MX8{910@@6n*A=BgxJs9(fTe>@upxAj8C1an%1g=g-{HKyGW(XF(MG?FP|xTtCe?3`CyyN9q2VVP!EQ?lcyYia zOAd0Lo2t9+JGe((?f_%5ykO(|i6^Er{PX^F;W@8?=qImC@sB2lOi)gYTEGpTvB}kmKU3? zwtXywDt{?jOy#X>zW=trp2I?_PJbz~J9%^5 z)fRn%4|GCN4;Q{CL7k5{X#n^A*qq-} zb*Y3LLTBTsj+b88O7F_z=cmzpYKAigez^2xyMBzYy)a@d%T9 zhtgYhQuCj_wGVkxnYfe9Z#<(tHD=BT|Cm=~VOdoCp(DPQH1Q1`Sf?(Xb)<{CUD{88 zeupDK_VtCD_RETvh{kfJS_ggSY5O0p(?1SsM+GP#?YYD#t(HrWki>p??-+@yk&Nh( zCxUM>Tr)G=(4Ku_lkPHtUT1l2&Y+!iwO$6U$km>v!423yYHy!Vk{RAmB}T+L>l{c~l$G_vmFC)Zfbun%n&P$c zptcYyFJQ>HQn@7qnigp2Cv-|e^C-Yy3f{cAlYpAcW}qeo zuYl?t#mzmtWs`sc<>QlAH}h=U;={RaOFaS_4PZFWH>Os;2*w5Kpgn(j8MJ&|Z+*6r_xKtv8YYma>_cysF;s53HR=N+D_FE@uo zByQ;$!3*GLSM1z`P%59#`i)K=b+irI{dL$upSVs@rGCsGDOMvFULb-?GLh_mG;8hY z!~9_IgXD|7*m`fNUuM(^KnfF=7>l2*CEXoZr(Ty(l6cO8M@WF9pPsbJ;!>P#>6_C$bn6qz_%~Bbu1=|eY zRXL$lK_9y>C}~7fM_>J0TuRfE6~k+wqzEqI^TEEl?H-J&%S)&NdKem5lo(}AUts=7 zqcwJ%SAQ)x7mT~;)CQAmDrj3MP)I7JpiQ()#QONcp!ru-#b#k5k!(~Hq#r%A-eQ6O z{;tNu54Ki%r0;Ui;9AC^y(aP%m2ByomMhV|E#dbMxJ$jRDY=w z6i(oIQXs0{j2+yV?Utkzdb<00lx)i8NcJA11cIJ&dCMd0SmFtY z(4+fkOI4m6{1zTTZ4+3ZEPR;hb_~`s1;w^_ZA0{4G{HFlu(an09;5D*;zmOssX;}- zmCzN>$@TDm2jS*`x`{_ou*ASXYklH_#Raqs`HdLOZMXyQ{6kRoK$ule?2n2bc1561 z2HB_OE>9RWJ-OkefE$X8iXt?8*T2{4Zn*ID3Zb|_;@!0N{rkj;{{Z-`ZE6||5=h^I zW$ALL67H#kLnjF2Bs9lCWzvB^$j{3oHWifOds&H~hXc7n8~)%b5=jJf1_Td=vU14O z6CFm&jwiU@zP>Jy!cgHbDFUpdgaiaaZXxKsj0wc#Q5B6Ev}Ui;=2Y&8K;~Z-G5Y1) zCtH6b98PNQD1lE_LPa6}J)m5ca&L~)l>WA2!l5U1*FG@;yh|BAvaDE>b;f?_CdIv` z(h}BV>Hog)tk=}gDgX`dP|{<26GTvPy(cJ8bInzGI0dq1zV(}Pgw7Ck@HU@l 
zO%NZ_x29KD`En_&mHE@piSN6&0^|+$m_=-?GTZW8o`@P}-m}Dsde?7R&zy%E9Wc08 z7^WFT-`_ze-PD~Od#!b1Gh&hHCQbzHCcsKZ@U6yQNRn<|y#`h=?UeUq{_oC-iZc6h zj|;osSSsIz|FJ6LeNaf%Fc8!7>4C-bF7%b_9x>6%iSvngDwv0jqTym^N;4( {output.o1} &> {log}" \ No newline at end of file + """ + kraken2 --use-names --threads 4 --db {params.p1} --report {output.o2} {input.i1} > {output.o1} &> {log} + touch {output.o3} + """ \ No newline at end of file diff --git a/scripts/synapse_id.tsv b/scripts/synapse_id.tsv new file mode 100644 index 0000000..e8e772a --- /dev/null +++ b/scripts/synapse_id.tsv @@ -0,0 +1,196 @@ +Directory ParentID Filename EntityID +Data/Gene Expression/Gene Expression scRNA seq/counts - CITEseq syn26560200 S33.zip syn24610356 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S1.zip syn24610321 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S10.zip syn24610331 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S11.zip syn24610333 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S12.zip syn24610334 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S13.zip syn24610335 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S14.zip syn24610337 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S15.zip syn24610338 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S16.zip syn24610339 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S17.zip syn24610340 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S18.zip syn24610341 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S19.zip syn24610342 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S2.zip syn24610322 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S20.zip syn24610343 +Data/Gene 
Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S21.zip syn24610344 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S22.zip syn24610345 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S23.zip syn24610346 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S24.zip syn24610347 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S25.zip syn24610348 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S26.zip syn24610349 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S27.zip syn24610350 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S28.zip syn24610351 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S29.zip syn24610352 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S3.zip syn24610323 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S30.zip syn24610353 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S31.zip syn24610354 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S32.zip syn24610355 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S4.zip syn24610324 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S5.zip syn24610325 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S6.zip syn24610327 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S7.zip syn24610328 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S8.zip syn24610329 +Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S9.zip syn24610330 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-ADT-ATCACGAT_S29_L001_I1_001.fastq.gz syn26534563 +Data/Gene 
Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-ADT-ATCACGAT_S29_L001_R1_001.fastq.gz syn26534582 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-ADT-ATCACGAT_S29_L001_R2_001.fastq.gz syn26534579 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-HTO-ATTACTCG_S30_L001_I1_001.fastq.gz syn26534567 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-HTO-ATTACTCG_S30_L001_R1_001.fastq.gz syn26534585 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-HTO-ATTACTCG_S30_L001_R2_001.fastq.gz syn26534584 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-AGCATCCG_S26_L001_I1_001.fastq.gz syn26534562 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-AGCATCCG_S26_L001_R1_001.fastq.gz syn26534575 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-AGCATCCG_S26_L001_R2_001.fastq.gz syn26534574 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-CCTCATTC_S25_L001_I1_001.fastq.gz syn26534565 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-CCTCATTC_S25_L001_R1_001.fastq.gz syn26534580 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-CCTCATTC_S25_L001_R2_001.fastq.gz syn26534576 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-GTGGCAAT_S27_L001_I1_001.fastq.gz syn26534564 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-GTGGCAAT_S27_L001_R1_001.fastq.gz syn26534578 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-GTGGCAAT_S27_L001_R2_001.fastq.gz syn26534577 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 
20190110-cDNA-TAATGGGA_S28_L001_I1_001.fastq.gz syn26534566 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-TAATGGGA_S28_L001_R1_001.fastq.gz syn26534573 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-TAATGGGA_S28_L001_R2_001.fastq.gz syn26534572 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 268_1970_positive_S4_L004_I1_001.fastq.gz syn26534419 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 268_1970_positive_S4_L004_R1_001.fastq.gz syn26534428 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 268_1970_positive_S4_L004_R2_001.fastq.gz syn26534444 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 271_1971_CD45positive_S1_L001_I1_001.fastq.gz syn26534424 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 271_1971_CD45positive_S1_L001_R1_001.fastq.gz syn26534431 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 271_1971_CD45positive_S1_L001_R2_001.fastq.gz syn26534446 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 286_1976_cells_positive_S3_L003_I1_001.fastq.gz syn26534420 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 286_1976_cells_positive_S3_L003_R1_001.fastq.gz syn26534427 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 286_1976_cells_positive_S3_L003_R2_001.fastq.gz syn26534441 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 289_005_positive_S4_L004_I1_001.fastq.gz syn26534426 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 289_005_positive_S4_L004_R1_001.fastq.gz syn26534430 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 289_005_positive_S4_L004_R2_001.fastq.gz syn26534445 +Data/Gene Expression/Gene Expression scRNA seq/fastqs 
- scRNAseq syn26560198 293_008_cells_positive_S5_L005_I1_001.fastq.gz syn26534425 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 293_008_cells_positive_S5_L005_R1_001.fastq.gz syn26534429 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 293_008_cells_positive_S5_L005_R2_001.fastq.gz syn26534447 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 294_009_cells_positive_S1_L001_I1_001.fastq.gz syn26534423 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 294_009_cells_positive_S1_L001_R1_001.fastq.gz syn26534442 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 294_009_cells_positive_S1_L001_R2_001.fastq.gz syn26534458 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 299_166-cells-positive_S2_L002_I1_001.fastq.gz syn26534422 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 299_166-cells-positive_S2_L002_R1_001.fastq.gz syn26534440 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 299_166-cells-positive_S2_L002_R2_001.fastq.gz syn26534457 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 301_014-cells-positive_S3_L003_I1_001.fastq.gz syn26534421 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 301_014-cells-positive_S3_L003_R1_001.fastq.gz syn26534437 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 301_014-cells-positive_S3_L003_R2_001.fastq.gz syn26534456 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 303_1991-cells-positive_S4_L004_I1_001.fastq.gz syn26534432 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 303_1991-cells-positive_S4_L004_R1_001.fastq.gz syn26534439 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 303_1991-cells-positive_S4_L004_R2_001.fastq.gz 
syn26534453 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 305_015-cells-positive_S5_L005_I1_001.fastq.gz syn26534433 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 305_015-cells-positive_S5_L005_R1_001.fastq.gz syn26534452 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 305_015-cells-positive_S5_L005_R2_001.fastq.gz syn26534462 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 325_2013_CD45_positive_S1_L001_I1_001.fastq.gz syn26534434 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 325_2013_CD45_positive_S1_L001_R1_001.fastq.gz syn26534450 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 325_2013_CD45_positive_S1_L001_R2_001.fastq.gz syn26534459 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 339_23_pos_cells_S3_L003_I1_001.fastq.gz syn26534435 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 339_23_pos_cells_S3_L003_R1_001.fastq.gz syn26534451 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 339_23_pos_cells_S3_L003_R2_001.fastq.gz syn26534465 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 345_2017-CD45_pos_S4_L004_I1_001.fastq.gz syn26534436 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 345_2017-CD45_pos_S4_L004_R1_001.fastq.gz syn26534460 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 345_2017-CD45_pos_S4_L004_R2_001.fastq.gz syn26534464 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 346_2019_CD45_pos_S1_L001_I1_001.fastq.gz syn26534438 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 346_2019_CD45_pos_S1_L001_R1_001.fastq.gz syn26534448 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 
346_2019_CD45_pos_S1_L001_R2_001.fastq.gz syn26534463 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 355_2015_CD45_pos_S4_L004_I1_001.fastq.gz syn26534454 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 355_2015_CD45_pos_S4_L004_R1_001.fastq.gz syn26534449 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 355_2015_CD45_pos_S4_L004_R2_001.fastq.gz syn26534455 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 356_CK030_corticol-CD45-pos_S1_L001_I1_001.fastq.gz syn26534466 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 356_CK030_corticol-CD45-pos_S1_L001_R1_001.fastq.gz syn26534461 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 356_CK030_corticol-CD45-pos_S1_L001_R2_001.fastq.gz syn26534485 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 362_CK031_cortex_CD45-pos_S2_L002_I1_001.fastq.gz syn26534467 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 362_CK031_cortex_CD45-pos_S2_L002_R1_001.fastq.gz syn26534474 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 362_CK031_cortex_CD45-pos_S2_L002_R2_001.fastq.gz syn26534484 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 363_VA_2028-pos_S3_L003_I1_001.fastq.gz syn26534468 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 363_VA_2028-pos_S3_L003_R1_001.fastq.gz syn26534472 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 363_VA_2028-pos_S3_L003_R2_001.fastq.gz syn26534483 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 364_2030-CD45-pos_S4_L004_I1_001.fastq.gz syn26534469 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 364_2030-CD45-pos_S4_L004_R1_001.fastq.gz syn26534471 +Data/Gene Expression/Gene Expression scRNA 
seq/fastqs - scRNAseq syn26560198 364_2030-CD45-pos_S4_L004_R2_001.fastq.gz syn26534481 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 365_CK033_S5_L005_I1_001.fastq.gz syn26534475 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 365_CK033_S5_L005_R1_001.fastq.gz syn26534470 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 365_CK033_S5_L005_R2_001.fastq.gz syn26534487 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 366_1996T_CD45-pos_S6_L006_I1_001.fastq.gz syn26534488 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 366_1996T_CD45-pos_S6_L006_R1_001.fastq.gz syn26534482 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 366_1996T_CD45-pos_S6_L006_R2_001.fastq.gz syn26534486 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 367_2037-CD45-pos_S7_L007_I1_001.fastq.gz syn26534489 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 367_2037-CD45-pos_S7_L007_R1_001.fastq.gz syn26534497 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 367_2037-CD45-pos_S7_L007_R2_001.fastq.gz syn26534507 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 369_182_positive_S8_L008_I1_001.fastq.gz syn26534491 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 369_182_positive_S8_L008_R1_001.fastq.gz syn26534499 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 369_182_positive_S8_L008_R2_001.fastq.gz syn26534508 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 375_VA_2028_pos_S3_L003_I1_001.fastq.gz syn26534490 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 375_VA_2028_pos_S3_L003_R1_001.fastq.gz syn26534495 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 
375_VA_2028_pos_S3_L003_R2_001.fastq.gz syn26534506 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 376_2030_CD45_pos_S4_L004_I1_001.fastq.gz syn26534570 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 376_2030_CD45_pos_S4_L004_R1_001.fastq.gz syn26534581 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 376_2030_CD45_pos_S4_L004_R2_001.fastq.gz syn26534586 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 381_BC_155_Tumor_CD45pos_S1_L001_I1_001.fastq.gz syn26534571 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 381_BC_155_Tumor_CD45pos_S1_L001_R1_001.fastq.gz syn26534583 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 381_BC_155_Tumor_CD45pos_S1_L001_R2_001.fastq.gz syn26534587 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 383_sample_cell_line_553_S3_L003_I1_001.fastq.gz syn26534494 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 383_sample_cell_line_553_S3_L003_R1_001.fastq.gz syn26534502 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 383_sample_cell_line_553_S3_L003_R2_001.fastq.gz syn26534509 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-ACAAGGTA_S54_L002_I1_001.fastq.gz syn26534518 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-ACAAGGTA_S54_L002_R1_001.fastq.gz syn26534540 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-ACAAGGTA_S54_L002_R2_001.fastq.gz syn26534537 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-CGTCCCGT_S55_L002_I1_001.fastq.gz syn26534519 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 
CK-045-CD45-scRNAseq-3-CGTCCCGT_S55_L002_R1_001.fastq.gz syn26534538 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-CGTCCCGT_S55_L002_R2_001.fastq.gz syn26534536 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-GTGGTACC_S52_L002_I1_001.fastq.gz syn26534521 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-GTGGTACC_S52_L002_R1_001.fastq.gz syn26534544 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-GTGGTACC_S52_L002_R2_001.fastq.gz syn26534542 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-TACTATAG_S53_L002_I1_001.fastq.gz syn26534523 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-TACTATAG_S53_L002_R1_001.fastq.gz syn26534546 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-TACTATAG_S53_L002_R2_001.fastq.gz syn26534545 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L001_I1_001.fastq.gz syn26534492 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L001_R1_001.fastq.gz syn26534501 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L001_R2_001.fastq.gz syn26534504 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L002_I1_001.fastq.gz syn26534514 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L002_R1_001.fastq.gz syn26534500 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L002_R2_001.fastq.gz syn26534512 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L003_I1_001.fastq.gz syn26534510 +Data/Gene Expression/Gene Expression scRNA 
seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L003_R1_001.fastq.gz syn26534503 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L003_R2_001.fastq.gz syn26534505 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L004_I1_001.fastq.gz syn26534493 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L004_R1_001.fastq.gz syn26534511 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L004_R2_001.fastq.gz syn26534530 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-ATCTCTGT_S45_L002_I1_001.fastq.gz syn26534516 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-ATCTCTGT_S45_L002_R1_001.fastq.gz syn26534534 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-ATCTCTGT_S45_L002_R2_001.fastq.gz syn26534529 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-CCTAGACC_S44_L002_I1_001.fastq.gz syn26534513 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-CCTAGACC_S44_L002_R1_001.fastq.gz syn26534533 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-CCTAGACC_S44_L002_R2_001.fastq.gz syn26534531 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-GGAGAGAG_S47_L002_I1_001.fastq.gz syn26534515 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-GGAGAGAG_S47_L002_R1_001.fastq.gz syn26534522 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-GGAGAGAG_S47_L002_R2_001.fastq.gz syn26534520 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-TAGCTCTA_S46_L002_I1_001.fastq.gz syn26534517 
+Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-TAGCTCTA_S46_L002_R1_001.fastq.gz syn26534535 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-TAGCTCTA_S46_L002_R2_001.fastq.gz syn26534532 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-ATCATGCA_S50_L002_I1_001.fastq.gz syn26534524 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-ATCATGCA_S50_L002_R1_001.fastq.gz syn26534551 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-ATCATGCA_S50_L002_R2_001.fastq.gz syn26534548 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-CCGGGTAT_S51_L002_I1_001.fastq.gz syn26534525 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-CCGGGTAT_S51_L002_R1_001.fastq.gz syn26534549 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-CCGGGTAT_S51_L002_R2_001.fastq.gz syn26534547 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-GGTTCCTC_S49_L002_I1_001.fastq.gz syn26534527 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-GGTTCCTC_S49_L002_R1_001.fastq.gz syn26534556 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-GGTTCCTC_S49_L002_R2_001.fastq.gz syn26534555 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-TAACAAGG_S48_L002_I1_001.fastq.gz syn26534528 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-TAACAAGG_S48_L002_R1_001.fastq.gz syn26534553 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-TAACAAGG_S48_L002_R2_001.fastq.gz 
syn26534550 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-AGGTATTG_S56_L002_I1_001.fastq.gz syn26534526 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-AGGTATTG_S56_L002_R1_001.fastq.gz syn26534541 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-AGGTATTG_S56_L002_R2_001.fastq.gz syn26534539 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-CTCCTAGT_S57_L002_I1_001.fastq.gz syn26534543 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-CTCCTAGT_S57_L002_R1_001.fastq.gz syn26534552 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-CTCCTAGT_S57_L002_R2_001.fastq.gz syn26534558 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-GATGCCAA_S59_L002_I1_001.fastq.gz syn26534557 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-GATGCCAA_S59_L002_R1_001.fastq.gz syn26534559 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-GATGCCAA_S59_L002_R2_001.fastq.gz syn26534561 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-TCAAGGCC_S58_L002_I1_001.fastq.gz syn26534560 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-TCAAGGCC_S58_L002_R1_001.fastq.gz syn26534569 +Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-TCAAGGCC_S58_L002_R2_001.fastq.gz syn26534568 +Data/Metadata syn24168324 HBI_scRNAseq_assay_scrnaSeq_metadata.csv syn24610436 +Data/Metadata syn24168324 HBI_scRNAseq_biospecimen_metadata.csv syn24610438 +Data/Metadata syn24168324 HBI_scRNAseq_individual_metadata.csv syn24610550 From 
0746ed40a85b5d5c6be181f25c38ffd9f1984afb Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 15:28:26 +0200 Subject: [PATCH 10/33] Added Flake workflow --- .github/workflows/flake.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/flake.yml diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml new file mode 100644 index 0000000..f490b2f --- /dev/null +++ b/.github/workflows/flake.yml @@ -0,0 +1,25 @@ +name: Flake Check + +on: + push: + branches: + - dev + +jobs: + flake-check: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.x + + - name: Install Snakemake + run: pip install snakemake + + - name: Dry run Snakemake + run: snakemake -cores 32 --use-conda --configfile config.yaml --dry-run From 9ccd0a42e14a65bbd5a73f8f66a232948109c86f Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 15:34:15 +0200 Subject: [PATCH 11/33] Updated flake.yml --- .github/workflows/flake.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml index f490b2f..25a4ca4 100644 --- a/.github/workflows/flake.yml +++ b/.github/workflows/flake.yml @@ -13,7 +13,12 @@ jobs: - name: Checkout code uses: actions/checkout@v2 - - name: Setup Python + - name: Setup Node.js + uses: actions/setup-node@v2 + with: + node-version: 16 + + - name: Set up Python uses: actions/setup-python@v2 with: python-version: 3.x From 67afd96e82eca3fa9b18d05f228d71c658275629 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 15:35:28 +0200 Subject: [PATCH 12/33] Updated flake.yml --- .github/workflows/flake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml index 25a4ca4..d5b75f8 100644 --- a/.github/workflows/flake.yml +++ b/.github/workflows/flake.yml @@ 
-27,4 +27,4 @@ jobs: run: pip install snakemake - name: Dry run Snakemake - run: snakemake -cores 32 --use-conda --configfile config.yaml --dry-run + run: snakemake --cores 32 --use-conda --configfile config.yaml --dry-run From 37a0d06ad4cc461c3a84bb9d32e38583ea804d20 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 15:37:35 +0200 Subject: [PATCH 13/33] Updated flake.yml --- .github/workflows/flake.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml index d5b75f8..a823983 100644 --- a/.github/workflows/flake.yml +++ b/.github/workflows/flake.yml @@ -26,5 +26,8 @@ jobs: - name: Install Snakemake run: pip install snakemake + - name: Installing Pandas + run: pip install pandas + - name: Dry run Snakemake run: snakemake --cores 32 --use-conda --configfile config.yaml --dry-run From 90aa6def6b2039b6461471d5ea38ce9d96d66dbb Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 15:41:07 +0200 Subject: [PATCH 14/33] Updated flake.yml --- .github/workflows/flake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml index a823983..cbc9ac0 100644 --- a/.github/workflows/flake.yml +++ b/.github/workflows/flake.yml @@ -30,4 +30,4 @@ jobs: run: pip install pandas - name: Dry run Snakemake - run: snakemake --cores 32 --use-conda --configfile config.yaml --dry-run + run: snakemake --cores 32 --use-conda --configfile config.yaml --dry-run --conda-frontend conda From 50e9391011eecb29826cde46793cc5be1a1b5c68 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 15:47:10 +0200 Subject: [PATCH 15/33] Added linting --- .github/workflows/flake.yml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml index cbc9ac0..d7ac2e8 100644 --- a/.github/workflows/flake.yml +++ b/.github/workflows/flake.yml @@ -13,11 +13,6 @@ jobs: - name: Checkout code 
uses: actions/checkout@v2 - - name: Setup Node.js - uses: actions/setup-node@v2 - with: - node-version: 16 - - name: Set up Python uses: actions/setup-python@v2 with: @@ -31,3 +26,15 @@ jobs: - name: Dry run Snakemake run: snakemake --cores 32 --use-conda --configfile config.yaml --dry-run --conda-frontend conda + + + Linting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Lint workflow + uses: snakemake/snakemake-github-action@v1.24.0 + with: + directory: . + snakefile: Snakefile + args: "--lint" \ No newline at end of file From 5845a9d179ed8770e2f61367232cec98114a0454 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 15 May 2023 18:03:22 +0200 Subject: [PATCH 16/33] Pipeline Cleanup --- READMD.md | 32 ++++++++++++++++++++++++++++++ Snakefile | 23 +++++++++------------ config.yaml | 5 +++-- env.yaml | 12 +++++++++++ rules/cellranger.smk | 15 ++++++++------ rules/download_samples_or_copy.smk | 22 ++++++++++---------- rules/extract_bam.smk | 12 ++++++----- rules/extract_tags.smk | 11 ++++++---- rules/kraken2_mapping.smk | 17 ++++++++-------- scripts/count_matrix_processing.py | 29 +++++++++++++++++++++++++++ 10 files changed, 128 insertions(+), 50 deletions(-) create mode 100644 READMD.md create mode 100644 env.yaml create mode 100644 scripts/count_matrix_processing.py diff --git a/READMD.md b/READMD.md new file mode 100644 index 0000000..15d8dfb --- /dev/null +++ b/READMD.md @@ -0,0 +1,32 @@ +# sc-Virome-Scan: A Snakemake pipeline for detection of viruses in single-cell datasets. 
+ +[![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) +[![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Tests/badge.svg?branch=main)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) + + +A method wrapped around Snakemake for swift, precise, and accurate detection of viral pathogens in single-cell +RNA (scRNA) datasets to investigate the possible correlation between viral pathogens and neurodegenerative diseases. + +
+ +# DAG for the pipeline +![Graphviz Diagram](dag.png) + + + +
+ +## Usage + +> ***snakemake --cores 16 --use-conda --configfile config.yaml --latency-wait 60*** + +### Note +This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. + +
+ +## Contributing +The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=maxplanck-ie%2Fsc-virome-scan). + +If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). + diff --git a/Snakefile b/Snakefile index d5452fe..46da13b 100644 --- a/Snakefile +++ b/Snakefile @@ -2,17 +2,11 @@ import glob import os import pandas as pd -#configfile = "/data/manke/processing/momin/virome-scan/workflow/config.yaml", -data_dir = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data" - +configfile: "config.yaml" accession_list = pd.read_table("SRA.tsv") samples = list(accession_list.Samples.unique()) -""" -samples, = glob_wildcards("/data/manke/processing/momin/virome-scan/sc-virome-scan/data/{sample}_L001_R1.fastq.gz") -print(samples) -""" include: "rules/download_samples_or_copy.smk" include: "rules/kraken2_mapping.smk" @@ -22,13 +16,14 @@ include: "rules/extract_tags.smk" rule all: input: - #expand("data/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), - #expand("data/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), - #expand("data/{sample}_test.txt",sample=samples), - #expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), - #expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples), - #expand("results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam",sample=samples), - #expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), + expand("data/{sample}/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), + expand("data/{sample}/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), + expand("data/{sample}_test.txt",sample=samples), + expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), + expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples), + 
expand("results/kraken2/{sample}/{sample}_classified_out.fastq",sample=samples), + expand("results/cellranger/{sample}/{sample}_finished.txt",sample=samples), + expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples) onsuccess: diff --git a/config.yaml b/config.yaml index e481375..7b13f6f 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,5 @@ -"files" : "proxy" +"files" : None "chemistry" : "SC3Pv2" "transcriptome" : "/data/repository/misc/cellranger_references/cellranger/refdata-gex-GRCh38-2020-A" -"dir" : "/data/manke/processing/momin/virome-scan/sc-virome-scan/data_old/" \ No newline at end of file +"local_files_dir" : "/data/manke/processing/momin/virome-scan/sc-virome-scan/data_pos_control/SRR13114612" +"kraken_db": "/data/repository/kraken2_contaminome/virus_db2023" \ No newline at end of file diff --git a/env.yaml b/env.yaml new file mode 100644 index 0000000..ba93358 --- /dev/null +++ b/env.yaml @@ -0,0 +1,12 @@ +name: sc-viromescan +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - kraken2 + - samtools + - entrez-direct + - parallel-fastq-dump + - pandas + - snakemake \ No newline at end of file diff --git a/rules/cellranger.smk b/rules/cellranger.smk index 65a3610..60223e8 100644 --- a/rules/cellranger.smk +++ b/rules/cellranger.smk @@ -2,18 +2,21 @@ rule cellranger: input: i1 = expand("data/{sample}_test.txt", sample=samples) output: - o1 = "results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam", - o2 = "results/cellranger/{sample}/{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" + o1 = temp("results/cellranger/{sample}/{sample}_finished.txt") priority: 90 + resources: + mem_mb = 26000 + threads: 30 log: - "results/cellranger/{sample}/{sample}_cellranger.log" + "results/cellranger/{sample}/{sample}.cellranger.log" params: p1 = config["chemistry"], p2 = config["transcriptome"], - p3 = 
"/data/manke/processing/momin/virome-scan/sc-virome-scan/data/", - p4 = "/data/manke/processing/momin/virome-scan/kraken2/negative_ctrl/data/" + p3 = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data/{sample}/" shell: """ + module load cellranger + touch {output.o1} cd results/cellranger/{wildcards.sample}/ - cellranger count --id {wildcards.sample} --fastqs {params.p4} --transcriptome {config[transcriptome]} --chemistry SC3Pv2 + cellranger count --id {wildcards.sample} --fastqs {params.p3} --transcriptome {config[transcriptome]} --localcores {threads} --chemistry {config[chemistry]} """ diff --git a/rules/download_samples_or_copy.smk b/rules/download_samples_or_copy.smk index d7ae67b..57a0921 100644 --- a/rules/download_samples_or_copy.smk +++ b/rules/download_samples_or_copy.smk @@ -1,22 +1,22 @@ rule download_samples_or_copy: output: - o1 = temp("data/{sample}_S1_L001_R1_001.fastq.gz"), - o2 = temp("data/{sample}_S1_L001_R2_001.fastq.gz") + o1 = "data/{sample}/{sample}_S1_L001_R1_001.fastq.gz", + o2 = "data/{sample}/{sample}_S1_L001_R2_001.fastq.gz", params: - outdir = "data", - data_directory = config["dir"] + outdir = "data/{sample}/" threads: 16 + log: + "results/logs/download_samples_or_copy/{sample}.log" priority: 100 - conda: - "envs/tools.yaml" shell: """ if [ {config[files]} == 'local' ]; then - cp {config[dir]}/*.fastq.gz data/ - + for file in {wildcards.sample}*; do + ln -s {config[local_files_dir]}/$file data/$file + done else parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip --tmpdir /data/manke/processing/momin/virome-scan/sc-virome-scan/tmp - mv data/{wildcards.sample}_1.fastq.gz data/{wildcards.sample}_S1_L001_R1_001.fastq.gz - mv data/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}_S1_L001_R2_001.fastq.gz + mv data/{wildcards.sample}/{wildcards.sample}_1.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R1_001.fastq.gz + mv 
data/{wildcards.sample}/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz fi - """ \ No newline at end of file + """ diff --git a/rules/extract_bam.smk b/rules/extract_bam.smk index ad0a901..b81b174 100644 --- a/rules/extract_bam.smk +++ b/rules/extract_bam.smk @@ -1,9 +1,11 @@ rule extract_bam: input: - "results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam" + i1 = "results/cellranger/{sample}/{sample}_finished.txt" output: - "results/cellranger/{sample}/unmapped_reads.sam" - conda: - "envs/samtools.yaml" + temp("results/cellranger/{sample}/unmapped_reads.sam") + log: + "results/samtools/{sample}.log" + params: + "results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam" shell: - "samtools view -f 4 {input} > {output}" \ No newline at end of file + "samtools view -@ 16 -f 4 {params} > {output}" \ No newline at end of file diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk index 2108a7a..514aedf 100644 --- a/rules/extract_tags.smk +++ b/rules/extract_tags.smk @@ -1,13 +1,16 @@ rule extract_tags: input: i1 = "results/cellranger/{sample}/unmapped_reads.sam", - i2 = "results/kraken2/{sample}/{sample}.kraken", - i3 = "results/cellranger/{sample}/{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" + i2 = "results/kraken2/{sample}/{sample}.kraken" output: "results/count_matrix/{sample}/count_matrix.tsv" + threads: 16 + resources: + mem_mb = 40000 params: - p1 = "results/count_matrix/{sample}/" + p1 = "results/count_matrix/{sample}/", + p2 = "results/cellranger/{sample}/{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" log: "results/count_matrix/{sample}_bam_extract.log" shell: - "python3 scripts/bam_extract.py -i {input.i1} -k {input.i2} -b {input.i3} -o {params.p1}" \ No newline at end of file + "python3 scripts/bam_extract.py -i {input.i1} -k {input.i2} -b {params.p2} -o {params.p1}" \ No newline at end of file diff --git a/rules/kraken2_mapping.smk b/rules/kraken2_mapping.smk 
index 55eccdd..92dd1f2 100644 --- a/rules/kraken2_mapping.smk +++ b/rules/kraken2_mapping.smk @@ -1,19 +1,20 @@ rule kraken2_mapping: input: - i1 = "data/{sample}_S1_L001_R2_001.fastq.gz" + i1 = "data/{sample}/{sample}_S1_L001_R2_001.fastq.gz" output: o1 = "results/kraken2/{sample}/{sample}.kraken", o2 = "results/kraken2/{sample}/{sample}.report.txt", - o3 = temp("data/{sample}_test.txt") - conda: - "envs/kraken2.yaml" + o3 = "results/kraken2/{sample}/{sample}_classified_out.fastq", + o4 = temp("data/{sample}_test.txt") priority: 95 + threads: 16 params: - p1 = "/data/repository/kraken2_contaminome/virus_db" + p1 = config["kraken_db"] log: - "results/logs/kraken2/{sample}_kraken.log" + "results/logs/kraken2/{sample}.kraken.log" shell: """ - kraken2 --use-names --threads 4 --db {params.p1} --report {output.o2} {input.i1} > {output.o1} &> {log} - touch {output.o3} + kraken2 --use-names --threads {threads} --db {params.p1} --report {output.o2} --output {output.o1} --classified-out {output.o3} {input.i1} 2> {log} + gzip {output.o3} + touch {output.o4} """ \ No newline at end of file diff --git a/scripts/count_matrix_processing.py b/scripts/count_matrix_processing.py new file mode 100644 index 0000000..20658a0 --- /dev/null +++ b/scripts/count_matrix_processing.py @@ -0,0 +1,29 @@ +import os + +directory_path = '/data/manke/processing/momin/virome-scan/DGE/EBV/count_matrix' + +tsv_files = [] +for root, dirs, files in os.walk(directory_path): + for file in files: + if file.endswith('.tsv'): + tsv_files.append(os.path.join(root, file)) + +taxon_df = pd.DataFrame(columns=['Tax_ID']) +for file in tsv_files: + df = pd.read_csv(file, sep='\t') + taxon = df['Tax_ID'] + taxon_df = pd.merge(taxon_df,taxon,on='Tax_ID', how='outer') + + +for file in tsv_files: + df2 = pd.read_csv(file, sep='\t') + samplename = str(file.split('/')[-2]) + summed_data = df2.groupby('Tax_ID').sum().iloc[:, :].sum(axis=1).to_frame(samplename) + taxon_df = pd.merge(taxon_df, summed_data, on='Tax_ID', 
how='left') + +taxon_df = taxon_df.fillna(0) +taxon_df['Tax_ID'] = taxon_df['Tax_ID'].str.split('(').str[0].str.strip() +taxon_df.iloc[:, 1:] = taxon_df.iloc[:, 1:].astype(int) +taxon_df = taxon_df.rename(columns={'Tax_ID': 'Taxon'}) +taxon_df.to_csv("/home/momin/count_matrix_summarized.tsv", sep='\t', index=False) +taxon_df \ No newline at end of file From 97a5ca41c83f6bf4aa15ef26541b4b61e3437d9e Mon Sep 17 00:00:00 2001 From: Saim Momin <64724322+SaimMomin12@users.noreply.github.com> Date: Mon, 15 May 2023 18:11:43 +0200 Subject: [PATCH 17/33] Update flake.yml --- .github/workflows/flake.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/flake.yml b/.github/workflows/flake.yml index d7ac2e8..945f777 100644 --- a/.github/workflows/flake.yml +++ b/.github/workflows/flake.yml @@ -26,15 +26,3 @@ jobs: - name: Dry run Snakemake run: snakemake --cores 32 --use-conda --configfile config.yaml --dry-run --conda-frontend conda - - - Linting: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Lint workflow - uses: snakemake/snakemake-github-action@v1.24.0 - with: - directory: . - snakefile: Snakefile - args: "--lint" \ No newline at end of file From 8c9be2c52de9729896a42039478316a5d99d1416 Mon Sep 17 00:00:00 2001 From: Saim Momin <64724322+SaimMomin12@users.noreply.github.com> Date: Mon, 15 May 2023 21:28:00 +0200 Subject: [PATCH 18/33] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 15d8dfb..d77a4c7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # sc-Virome-Scan: A Snakemake pipeline for detection of viruses in single-cell datasets. 
[![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) -[![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Tests/badge.svg?branch=main)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) +[![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Flake Check/badge.svg?branch=dev)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) A method wrapped around Snakemake for swift, precise, and accurate detection of viral pathogens in single-cell From 884e85160f35ec9cb856205b873af471ff1f90b0 Mon Sep 17 00:00:00 2001 From: Saim Date: Thu, 25 May 2023 17:15:38 +0200 Subject: [PATCH 19/33] Code Cleanup --- config.yaml | 1 - rules/cellranger.smk | 2 +- rules/envs/kraken2.yaml | 6 ------ rules/envs/samtools.yaml | 6 ------ rules/envs/tools.yaml | 9 --------- 5 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 rules/envs/kraken2.yaml delete mode 100644 rules/envs/samtools.yaml delete mode 100644 rules/envs/tools.yaml diff --git a/config.yaml b/config.yaml index 7b13f6f..239b286 100644 --- a/config.yaml +++ b/config.yaml @@ -1,5 +1,4 @@ "files" : None -"chemistry" : "SC3Pv2" "transcriptome" : "/data/repository/misc/cellranger_references/cellranger/refdata-gex-GRCh38-2020-A" "local_files_dir" : "/data/manke/processing/momin/virome-scan/sc-virome-scan/data_pos_control/SRR13114612" "kraken_db": "/data/repository/kraken2_contaminome/virus_db2023" \ No newline at end of file diff --git a/rules/cellranger.smk b/rules/cellranger.smk index 60223e8..0a79baf 100644 --- a/rules/cellranger.smk +++ b/rules/cellranger.smk @@ -18,5 +18,5 @@ rule cellranger: module load cellranger touch {output.o1} cd results/cellranger/{wildcards.sample}/ - cellranger count --id {wildcards.sample} --fastqs {params.p3} --transcriptome {config[transcriptome]} --localcores {threads} --chemistry 
{config[chemistry]} + cellranger count --id {wildcards.sample} --fastqs {params.p3} --transcriptome {config[transcriptome]} --localcores {threads} """ diff --git a/rules/envs/kraken2.yaml b/rules/envs/kraken2.yaml deleted file mode 100644 index 516686a..0000000 --- a/rules/envs/kraken2.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - kraken2 \ No newline at end of file diff --git a/rules/envs/samtools.yaml b/rules/envs/samtools.yaml deleted file mode 100644 index 2c1c845..0000000 --- a/rules/envs/samtools.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - samtools \ No newline at end of file diff --git a/rules/envs/tools.yaml b/rules/envs/tools.yaml deleted file mode 100644 index 842fc5a..0000000 --- a/rules/envs/tools.yaml +++ /dev/null @@ -1,9 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - entrez-direct - - parallel-fastq-dump - - umi_tools - - bowtie2 \ No newline at end of file From 5eb03599d09e2f7fb9e99d598f5ff4f9e509fafe Mon Sep 17 00:00:00 2001 From: Saim Date: Thu, 25 May 2023 17:22:23 +0200 Subject: [PATCH 20/33] Code Cleanup --- LICENSE | 21 --------------------- READMD.md | 32 -------------------------------- envs/kraken2.yaml | 6 ------ envs/samtools.yaml | 6 ------ envs/tools.yaml | 9 --------- 5 files changed, 74 deletions(-) delete mode 100644 LICENSE delete mode 100644 READMD.md delete mode 100644 envs/kraken2.yaml delete mode 100644 envs/samtools.yaml delete mode 100644 envs/tools.yaml diff --git a/LICENSE b/LICENSE deleted file mode 100644 index e9b75a4..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2021, AUTHORS - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/READMD.md b/READMD.md deleted file mode 100644 index 15d8dfb..0000000 --- a/READMD.md +++ /dev/null @@ -1,32 +0,0 @@ -# sc-Virome-Scan: A Snakemake pipeline for detection of viruses in single-cell datasets. - -[![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) -[![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Tests/badge.svg?branch=main)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) - - -A method wrapped around Snakemake for swift, precise, and accurate detection of viral pathogens in single-cell -RNA (scRNA) datasets to investigate the possible correlation between viral pathogens and neurodegenerative diseases. - -
- -# DAG for the pipeline -![Graphviz Diagram](dag.png) - - - -
- -## Usage - -> ***snakemake --cores 16 --use-conda --configfile config.yaml --latency-wait 60*** - -### Note -This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. - -
- -## Contributing -The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=maxplanck-ie%2Fsc-virome-scan). - -If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). - diff --git a/envs/kraken2.yaml b/envs/kraken2.yaml deleted file mode 100644 index 516686a..0000000 --- a/envs/kraken2.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - kraken2 \ No newline at end of file diff --git a/envs/samtools.yaml b/envs/samtools.yaml deleted file mode 100644 index 2c1c845..0000000 --- a/envs/samtools.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - samtools \ No newline at end of file diff --git a/envs/tools.yaml b/envs/tools.yaml deleted file mode 100644 index 842fc5a..0000000 --- a/envs/tools.yaml +++ /dev/null @@ -1,9 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - entrez-direct - - parallel-fastq-dump - - umi_tools - - bowtie2 \ No newline at end of file From 965aef8c93b200cefa83d1f8a5bee543de8ac086 Mon Sep 17 00:00:00 2001 From: Saim Date: Thu, 25 May 2023 17:25:23 +0200 Subject: [PATCH 21/33] Updated Cellranger rule --- rules/cellranger.smk | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rules/cellranger.smk b/rules/cellranger.smk index 0a79baf..1f6cefa 100644 --- a/rules/cellranger.smk +++ b/rules/cellranger.smk @@ -9,9 +9,7 @@ rule cellranger: threads: 30 log: "results/cellranger/{sample}/{sample}.cellranger.log" - params: - p1 = config["chemistry"], - p2 = config["transcriptome"], + params: p3 = "/data/manke/processing/momin/virome-scan/sc-virome-scan/data/{sample}/" shell: """ From 299eaf39185f65ce8443b56a5c623a90cb618fd2 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 10 Jul 2023 16:53:48 +0200 Subject: [PATCH 22/33] Minor 
Changes to Script --- scripts/bam_extract.py | 78 ++++++++++++--------- scripts/kraken_build.py | 94 +++++++++++++++++++++++++ scripts/kraken_plot.py | 144 ++++++++++++++++++++++++--------------- scripts/synapse_fetch.py | 78 ++++++++++++--------- 4 files changed, 276 insertions(+), 118 deletions(-) create mode 100644 scripts/kraken_build.py diff --git a/scripts/bam_extract.py b/scripts/bam_extract.py index ba6e6ff..f3b8dde 100644 --- a/scripts/bam_extract.py +++ b/scripts/bam_extract.py @@ -2,57 +2,71 @@ import argparse parser = argparse.ArgumentParser() -parser.add_argument("-i", "--input_file", metavar="PATH", help="Path to your SAM file") -parser.add_argument("-k", "--kraken_input_file", metavar="FILE", help="Path of your .kraken file from Kraken output") -parser.add_argument("-b", "--barcode_file", metavar="FILE", help="Filtered Barcodes tsv file generated from Cell Ranger") -parser.add_argument("-o", "--output-file", metavar="PATH", help="Path to your output file") +parser.add_argument("-i", "--input_file", + metavar="PATH", + help="Path to your SAM file") +parser.add_argument("-k", "--kraken_input_file", + metavar="FILE", + help="Path of your .kraken file from Kraken output") +parser.add_argument("-b", "--barcode_file", + metavar="FILE", + help="Filtered Barcodes TSV file from CellRanger") +parser.add_argument("-o", "--output-file", + metavar="PATH", + help="Path to your output file") args = parser.parse_args() file = args.input_file -tags_dict = {} +tags_dict = {} -#TODO: Check if the Kraken file is empty +# TODO: Check if the Kraken file is empty -#Parsing SAM file and storing ReadName along with its TAGS in dictionary +# Parsing SAM file and storing ReadName along with its TAGS in dictionary with open(file) as f: for line in f: - if line.startswith("@"): #Skipping header + if line.startswith("@"): continue - fields = line.strip().split("\t") - read_name = fields[0] - entries = fields[11:] - + fields = line.strip().split("\t") + read_name = fields[0] + 
entries = fields[11:] + for tag in entries: tag_fields = tag.split(":") tag_name = tag_fields[0] tag_value = tag_fields[2] - if tag_name in ["CR", "CB", "UR", "UB"]: + if tag_name in ["CR", "CB", "UR", "UB"]: if read_name in tags_dict: - tags_dict[read_name][tag_name] = tag_value #Adding tag to existing existing dictionary for the read name + # Adding tag to existing dictionary for the read name + tags_dict[read_name][tag_name] = tag_value else: - tags_dict[read_name] = {tag_name: tag_value} #Else creating new dictionary and adding rag to it -#Converting them to Pandas Dataframe -df = pd.DataFrame.from_dict(tags_dict, orient='index') -df.index.name = "Read Name" + # Else creating new dictionary and adding tag to it + tags_dict[read_name] = {tag_name: tag_value} + +# Converting them to Pandas Dataframe +df = pd.DataFrame.from_dict(tags_dict, orient='index') +df.index.name = "Read Name" df.reset_index(inplace=True) -df1 = df.drop_duplicates(subset=['CR','UR'],keep = 'last').reset_index(drop = True) -print(df1.head()) +df1 = df.drop_duplicates(subset=['CR', 'UR'], + keep='last').reset_index(drop=True) -#Reading Kraken Output and merging with the previous Dataframe +# Reading Kraken Output and merging with the previous Dataframe columns_name = ['status', 'Read Name', 'Tax_ID', 'length', 'LCA_mapping'] -df2 = pd.read_csv(args.kraken_input_file,sep='\t',names=columns_name, index_col=None) -merged = pd.merge(df1,df2,on='Read Name', how='inner') -merged_subset = merged.loc[:, ['Read Name', 'Tax_ID', 'CR', 'CB','UR', 'UB']] -print(merged_subset.head()) +df2 = pd.read_csv(args.kraken_input_file, sep='\t', + names=columns_name, index_col=None) +merged = pd.merge(df1, df2, on='Read Name', how='inner') +merged_subset = merged.loc[:, ['Read Name', 'Tax_ID', 'CR', 'CB', 'UR', 'UB']] -#Filtering dataframe based on barcodes reported by CellRanger -barcodes = pd.read_csv(args.barcode_file, compression='gzip',sep='\t', names=['CR']) -barcodes.CR = [x.strip().replace('-1', '') for x 
in barcodes.CR] -merged_barcodes = pd.merge(merged_subset,barcodes,on='CR', how='inner') -print(merged_barcodes.head()) +# Filtering dataframe based on barcodes reported by CellRanger +barcodes = pd.read_csv(args.barcode_file, + compression='gzip', sep='\t', names=['CR']) +barcodes.CR = [x.strip().replace('-1', '') for x in barcodes.CR] +merged_barcodes = pd.merge(merged_subset, barcodes, + on='CR', how='inner') -#Creating a count matrix and writing it to a file -result = merged_barcodes.pivot_table(index='Tax_ID', columns='CR', values='Read Name', aggfunc='count').fillna(0.).astype(int) +# Creating a count matrix and writing it to a file +result = merged_barcodes.pivot_table(index='Tax_ID', columns='CR', + values='Read Name', + aggfunc='count').fillna(0.).astype(int) result.to_csv(args.output_file + "count_matrix.tsv", sep='\t') diff --git a/scripts/kraken_build.py b/scripts/kraken_build.py new file mode 100644 index 0000000..5975628 --- /dev/null +++ b/scripts/kraken_build.py @@ -0,0 +1,94 @@ +import pandas as pd +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input_families_list", + metavar="PATH", + help="Path to input txt file consisting of Families Names") + +parser.add_argument("-names", "--input_names_dmp", + metavar="PATH", + help="Path to input names.dmp file") + +parser.add_argument("-acc", "--input_refseq_acc", + metavar="PATH", + help="Path to input RefSeq AccesionID file") + +parser.add_argument("-ngb", "--input_nucl_gb", + metavar="PATH", + help="Path to input nucl_gb.accession2taxid file") + +parser.add_argument("-nodes", "--input_nodes_dmp", + metavar="PATH", + help="Path to input nodes.dmp file") + +parser.add_argument("-out", "--output_acc_path", + metavar="PATH", + help="Path to save filtered accession list") + +args = parser.parse_args() + +# Step 1: Reading the Families text file and extracting corresponding TaxIDs from names.dmp file +with open(args.input_families_list, 'r') as f: + families = 
list(f.read().splitlines()) + +fam2taxid = {} +with open(args.input_names_dmp, "r") as file: + for line in file: + data = line.split("|") + taxid = data[0].strip() + taxname = data[1].strip() + if taxname in families: + fam2taxid[taxname] = int(taxid) +fam_list = [i for i in fam2taxid.values()] + +# Step 2: Retreiving the NCBI Accession IDs for each TaxID from nucl_gb.accession2taxid file +virus_accession = pd.read_csv(args.input_refseq_acc, names=['accession.version']) +accession2taxid = pd.read_csv(args.input_nucl_gb, sep="\t") +result = virus_accession.merge(accession2taxid, on='accession.version', how='left') +result = result[['accession.version', 'taxid']] +result2 = result.dropna(subset=['taxid']) +result2.taxid = result2.taxid.astype(int) +taxid_to_trace = sorted(set(result2["taxid"])) + +# Function to backtrace until Family TaxID for a given TaxID + + +def taxatrace(tax_id, df): + path = [tax_id] + for i in range(len(df)): + row = df[df['TaxID'] == tax_id] + if len(row) == 0: + break + parent_id = row['ParentID'].values[0] + if tax_id != parent_id: + level = row['Level'].values[0] + if level == 'family': + path.append(parent_id) + break + else: + path.append(parent_id) + tax_id = parent_id + else: + break + return path + + +# Step 3: Backtracing the TaxIDs(2) until Family Level +nodes = pd.read_csv(args.input_nodes_dmp, + sep="|", usecols=[0, 1, 2], + names=['TaxID', 'ParentID', 'Level']) +nodes = nodes.replace('\t', '', regex=True) +path_trace = [] +for i in taxid_to_trace: + path = taxatrace(i, nodes) + path_trace.append(path) + +# Step 4: Filtering from Backtracing Step to only keep results of Families of Interest +filtered_list = [x for x in path_trace if any(y in x for y in fam_list)] +filtered_taxid = [element[0] for element in filtered_list] + +# Step 5: NCBI Accession List Filtering +filtered_accessions = result2[result2['taxid'].isin(filtered_taxid)] +filtered_accessions.to_csv(args.output_acc_path + "viral_families.tsv", + sep='\t', index=False) 
diff --git a/scripts/kraken_plot.py b/scripts/kraken_plot.py index 09a87bc..c973471 100644 --- a/scripts/kraken_plot.py +++ b/scripts/kraken_plot.py @@ -1,20 +1,23 @@ """ -Summarizes and performs plotting of Clustermap for the total reads counts assigned for each family and species by Kraken2. +Summarizes Kraken2 reports and plots Clustermap for +the total reads counts assigned to each family/species by Kraken2. -This script processes all the Kraken2.report.txt files Kraken2 tool and organizes the number of reads(fragments) -mapped to each family and species level. The output of the scripts are two tab-separated files consisting of reads -mapped to Family level and Species level respectively. Depending on the TSV files, a clustermap is plotted to analyze -the distribution of taxons in the all report files of the samples. The Clustermap are saved to PNG file in the directory -specified by the user. +This script processes all the Kraken2.report.txt files Kraken2 tool and +organizes the number of reads(fragments) mapped to each family and species +level. The output of the scripts are two tab-separated files consisting of +reads mapped to Family level and Species level respectively. Depending on +the TSV files, a clustermap is plotted to analyze the distribution of taxons +in the all report files of the samples. The Clustermap are saved to PNG file +in the directory specified by the user. -Usage: - ./kraken_plot.py -i -o +Usage: + ./kraken_plot.py -i -o Outputs: - - familywise_taxonomic_readcounts.tsv - - specieswise_taxonomic_readcounts.tsv - - clustermap_familywise_log10.png - - clustermap_specieswise_log10.png + - Familywise_tax_readcounts.tsv + - Specieswise_tax_readcounts.tsv + - Clustermap_Familywise_log10.png + - Clustermap_Specieswise_log10.png Returns: None. The script saves all the files to a output directory. 
@@ -22,8 +25,8 @@ Author: Saim Momin -Last Updated: - 06-04-2023 +Last Updated: + 07-07-2023 """ @@ -31,85 +34,118 @@ import seaborn as sns import numpy as np import matplotlib.pyplot as plt -import scipy -import glob import os import argparse parser = argparse.ArgumentParser() -parser.add_argument("-i", "--input_file_directory", metavar="PATH", help="Path to master directory Kraken2 report files") -parser.add_argument("-o", "--output_file_directory", metavar="PATH", help="Path to directory for output files") +parser.add_argument("-i", "--input_file_directory", + metavar="PATH", + help="Path to master directory Kraken2 report files") + +parser.add_argument("-o", "--output_file_directory", + metavar="PATH", + help="Path to directory for output files") + args = parser.parse_args() -path = args.input_file_directory -dirs = [os.path.join(path, d) for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] -kraken_files = [] +path = args.input_file_directory family_data = pd.DataFrame(columns=['Taxon']) species_data = pd.DataFrame(columns=['Taxon']) +kraken_files = [] + +for root, dirs, files in os.walk(path): + for file in files: + if file.endswith(".report.txt"): + file_path = os.path.join(root, file) + kraken_files.append(file_path) + +print("\n --- Kraken2 Reports Summarization Script ---\n") +print("\n Specified Input directory: ", args.input_file_directory) +print("\n Specified Output directory: ", args.output_file_directory) +print("\n Total Kraken2 Reports Detected: ", len(kraken_files)) -for directory in dirs: - kraken_files.extend(glob.glob(os.path.join(directory, '*.txt'))) - for file in kraken_files: - df = pd.read_csv(file, sep='\t', names=["Perc", "Reads_covered", "Reads_Assigned", "Order", "Tax_ID", "Taxon"]) - df1 = df.loc[df['Order'] == 'F'].sort_values("Reads_covered", ascending=False) #Fetching Families rows - df2 = df.loc[df['Order'] == 'S'].sort_values("Reads_covered", ascending=False) #Fetching Species rows - + df = pd.read_csv(file, 
sep='\t', + names=["Perc", "Reads_covered", "Reads_Assigned", "Order", + "Tax_ID", "Taxon"]) + # Fetching Families and Species rows + df1 = df.loc[df['Order'] == 'F'].sort_values("Reads_covered", + ascending=False) + df2 = df.loc[df['Order'] == 'S'].sort_values("Reads_covered", + ascending=False) + df1 = df1[['Taxon', 'Reads_covered']] df2 = df2[['Taxon', 'Reads_covered']] - - df1['Taxon'] = df1['Taxon'].str.replace('\s+', '', regex=True) #Removing whitespaces + + # Removing whitespaces + df1['Taxon'] = df1['Taxon'].str.replace('\s+', '', regex=True) df2['Taxon'] = df2['Taxon'].str.replace('\s+', '', regex=True) - + + # Changing column name to filename filename = os.path.basename(file).split(".")[0] - df1 = df1.rename(columns={'Reads_covered': filename}) #Changing column name to filename + df1 = df1.rename(columns={'Reads_covered': filename}) df2 = df2.rename(columns={'Reads_covered': filename}) - - family_data = pd.merge(family_data, df1, on='Taxon', how='outer') #Merging out all columns + + # Merging out all columns + family_data = pd.merge(family_data, df1, on='Taxon', how='outer') species_data = pd.merge(species_data, df2, on='Taxon', how='outer') - -cols = [family_data.columns[0]] + sorted(family_data.columns[1:]) #Sorting the columns + +# Sorting the columns +cols = [family_data.columns[0]] + sorted(family_data.columns[1:]) cols2 = [species_data.columns[0]] + sorted(species_data.columns[1:]) family_data = family_data[cols] species_data = species_data[cols2] +family_data.to_csv(args.output_file_directory + + "Familywise_tax_readcounts.tsv", sep='\t', + index=False) +species_data.to_csv(args.output_file_directory + + "Specieswise_tax_readcounts.tsv", sep='\t', + index=False) + +print("\n Processing Completed! 
Now Plotting Clustermaps") -family_data.to_csv(args.output_file_directory + "familywise_taxonomic_readcounts.tsv", sep='\t', index=False) -species_data.to_csv(args.output_file_directory + "specieswise_taxonomic_readcounts.tsv", sep='\t', index=False) - # --- Plotting Clustermap for Taxon --- family_map = family_data.set_index("Taxon") -family_map_log10 = family_map.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x) #Log10 transformation -family_map_cleaned = family_map_log10.fillna(0) #Filling missing values -sns.set(rc={"figure.figsize": (80,60)}) +family_map_log10 = family_map.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x) +family_map_cleaned = family_map_log10.fillna(0) +sns.set(rc={"figure.figsize": (80, 60)}) sns.set(font_scale=0.6) -g = sns.clustermap(family_map_cleaned, cmap="coolwarm", xticklabels=True, yticklabels=True) +g = sns.clustermap(family_map_cleaned, + cmap="coolwarm", + xticklabels=True, + yticklabels=True) g.ax_heatmap.yaxis.set_tick_params(labelsize=4) plt.title("Family-Wise Clustermap") -plt.suptitle("Family-Wise Clustermap", ha="center", va="center", fontsize=14, y=1.0) +plt.suptitle("Family-Wise Clustermap", + ha="center", va="center", fontsize=14, y=1.0) plt.ylabel("Read Counts (log10)") -g.savefig(args.output_file_directory + "clustermap_familywise_log10.png", dpi=1200) +g.savefig(args.output_file_directory + + "Clustermap_Familywise_log10.png", dpi=1200) plt.show() # --- Plotting Clustermap for Top-10 Species --- -species_data['maximum'] = species_data.max(axis=1,numeric_only=True) #Getting maximum reads -sorted_species_data = species_data.sort_values(by = 'maximum', ascending = False) +species_data['maximum'] = species_data.max(axis=1, numeric_only=True) +sorted_species_data = species_data.sort_values(by='maximum', ascending=False) top_10_species = sorted_species_data.head(10) species_map = top_10_species.set_index("Taxon") -species_map_log10 = species_map.apply(lambda x: np.log10(x) if 
np.issubdtype(x.dtype, np.number) else x) #Log10 transformation -species_map_cleaned = species_map_log10.fillna(0) #Filling missing values +species_map_log10 = species_map.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x) +species_map_cleaned = species_map_log10.fillna(0) species_map_data = species_map_cleaned.loc[:, species_map_cleaned.columns != "maximum"] -sns.set(rc={"figure.figsize": (80,60)}) +sns.set(rc={"figure.figsize": (80, 60)}) sns.set(font_scale=0.6) -g = sns.clustermap(species_map_data, cmap="coolwarm", xticklabels=True, yticklabels=True) +g = sns.clustermap(species_map_data, cmap="coolwarm", + xticklabels=True, yticklabels=True) g.ax_heatmap.yaxis.set_tick_params(labelsize=4) plt.title("Species-Wise Clustermap") -plt.suptitle("Species-Wise Clustermap (Top 10 Species)", ha="center", va="center", fontsize=14, y=1.0) +plt.suptitle("Species-Wise Clustermap (Top 10 Species)", + ha="center", va="center", fontsize=14, y=1.0) plt.ylabel("Read Counts (log10)") -g.savefig(args.output_file_directory + "clustermap_specieswise_log10.png", dpi=1200) +g.savefig(args.output_file_directory + + "Clustermap_Specieswise_log10.png", dpi=1200) plt.show() -print("--- Script Completed Successfully ---") \ No newline at end of file +print("\n--- Script Completed Successfully! ---\n") diff --git a/scripts/synapse_fetch.py b/scripts/synapse_fetch.py index 6446806..fde04b1 100644 --- a/scripts/synapse_fetch.py +++ b/scripts/synapse_fetch.py @@ -1,7 +1,8 @@ """ -This script logs into Synapse portal and walks through a parent folder to get a list of all entities present in it. -It then preprocesses the list into a Pandas DataFrame along with directory information. +This script logs into Synapse portal and walks through a parent folder +to get a list of all entities present in it. It then preprocesses the +list into a Pandas DataFrame along with directory information. 
Usage: ./synapse_fetch.py -i @@ -11,14 +12,14 @@ This script requires the following modules to be imported: - pandas - synapseutils - - synapseclient + - synapseclient Output: - synapse_ids.tsv A tab seperated file consisting of following columns - - Directory: Directory name for a file stored in Synapse portal for a ParentID + - Directory: Directory name for file stored in Synapse portal - ParentID: ParentID of the Directory - - Filename: Name of the file + - Filename: Name of the file - EntityID: SynapseID for the entities under the parent Returns: @@ -27,22 +28,33 @@ Author: Saim Momin -Last Updated: - 12-04-2023 - -""" +Last Updated: + 07-07-2023 +""" +import os +import sys import pandas as pd import synapseutils import synapseclient import argparse +import warnings parser = argparse.ArgumentParser() -parser.add_argument("-i", "--input_synapse_id", metavar="ID", help="Synapse ID for Query") +parser.add_argument("-i", "--input_synapse_id", + metavar="ID", + help="Synapse ID for Query") +parser.add_argument("-o", "--output_file_dir", + metavar="PATH", + help="Path to output the Fetched Data") args = parser.parse_args() -with open("/home/momin/.synapseConfig") as f: +if not os.path.exists(os.path.expanduser("~/.synapseConfig")): + raise FileNotFoundError("The Synapse Configuration file not found. Exiting!") + sys.exit(0) + +with open(os.path.expanduser("~/.synapseConfig")) as f: file = f.read().strip().split("\n") for i in file: if i.startswith("username"): @@ -51,9 +63,16 @@ token = str(i[12::]) syn = synapseclient.Synapse() -syn.login(email=user, authToken=token) - -#Walking throw the Parent-ID and getting all the entities present in them +try: + syn.login(email=user, authToken=token) + print("Connection Established Successfully!") +except Exception as e: + print("\nConnection failed:", str(e)) + warnings.simplefilter("ignore", UserWarning) + print('\nPlease Check your Login Credentials. 
Exiting!!') + +# Walking throw the Parent-ID and getting all the entities present in them +print("Retrieving the Requested Data...") file_list = [] test2 = synapseutils.walk(syn, args.input_synapse_id) for dirpath,dirname, filename in test2: @@ -61,23 +80,18 @@ file_info = {'dir': dirpath, 'file': f} file_list.append(file_info) -#Preprocessing for the fetched directory and file list from Parent ID +# Preprocessing for the fetched directory and file list from Parent ID df = pd.DataFrame(file_list) df1 = df.applymap(lambda x: str(x).replace("'", "").replace("(", "").replace(")", "")) -df1[['Directory', 'ParentID']] = df1['dir'].str.split(',', expand=True) -df1[['Filename', 'EntityID']] = df1['file'].str.split(',', expand=True) -df1.drop(['dir', 'file'],axis=1, inplace=True) -df1.to_csv("synapse_ids.tsv", sep="\t", index=False) - -#TODO: Get the list of Synapse ids only with .fastq.gz command - - - - -#TODO: Work on the downloading part and storing it in the directory. Possibly by multithreading approach (Discuss?) 
- - - - - - +df1[['Directory', 'ParentID']] = df1['dir'].str.split(',', expand=True) +df1[['Samplename', 'SynapseID']] = df1['file'].str.split(',', expand=True) +df1.drop(['dir', 'file'], axis=1, inplace=True) +df1.to_csv(args.output_file_dir + "synapse_ids_all.tsv", sep="\t", index=False) + +# Writing Sample and corresponding Synapse IDs +filtered_df = df1[df1['Samplename'].str.contains('R1|R2')] +filtered_df = filtered_df[['Samplename', 'SynapseID']] +filtered_df.to_csv(args.output_file_dir + "synapse_fastqs_ids.tsv", + sep="\t", index=False) + +print("\nRetrieving Data Successfull...") From 0337b71809e475f03cb82477a4168779b6be5529 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 10 Jul 2023 16:58:39 +0200 Subject: [PATCH 23/33] Minor Changes --- scripts/count_matrix_processing.py | 29 ----- scripts/synapse_id.tsv | 196 ----------------------------- 2 files changed, 225 deletions(-) delete mode 100644 scripts/count_matrix_processing.py delete mode 100644 scripts/synapse_id.tsv diff --git a/scripts/count_matrix_processing.py b/scripts/count_matrix_processing.py deleted file mode 100644 index 20658a0..0000000 --- a/scripts/count_matrix_processing.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -directory_path = '/data/manke/processing/momin/virome-scan/DGE/EBV/count_matrix' - -tsv_files = [] -for root, dirs, files in os.walk(directory_path): - for file in files: - if file.endswith('.tsv'): - tsv_files.append(os.path.join(root, file)) - -taxon_df = pd.DataFrame(columns=['Tax_ID']) -for file in tsv_files: - df = pd.read_csv(file, sep='\t') - taxon = df['Tax_ID'] - taxon_df = pd.merge(taxon_df,taxon,on='Tax_ID', how='outer') - - -for file in tsv_files: - df2 = pd.read_csv(file, sep='\t') - samplename = str(file.split('/')[-2]) - summed_data = df2.groupby('Tax_ID').sum().iloc[:, :].sum(axis=1).to_frame(samplename) - taxon_df = pd.merge(taxon_df, summed_data, on='Tax_ID', how='left') - -taxon_df = taxon_df.fillna(0) -taxon_df['Tax_ID'] = 
taxon_df['Tax_ID'].str.split('(').str[0].str.strip() -taxon_df.iloc[:, 1:] = taxon_df.iloc[:, 1:].astype(int) -taxon_df = taxon_df.rename(columns={'Tax_ID': 'Taxon'}) -taxon_df.to_csv("/home/momin/count_matrix_summarized.tsv", sep='\t', index=False) -taxon_df \ No newline at end of file diff --git a/scripts/synapse_id.tsv b/scripts/synapse_id.tsv deleted file mode 100644 index e8e772a..0000000 --- a/scripts/synapse_id.tsv +++ /dev/null @@ -1,196 +0,0 @@ -Directory ParentID Filename EntityID -Data/Gene Expression/Gene Expression scRNA seq/counts - CITEseq syn26560200 S33.zip syn24610356 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S1.zip syn24610321 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S10.zip syn24610331 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S11.zip syn24610333 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S12.zip syn24610334 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S13.zip syn24610335 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S14.zip syn24610337 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S15.zip syn24610338 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S16.zip syn24610339 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S17.zip syn24610340 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S18.zip syn24610341 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S19.zip syn24610342 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S2.zip syn24610322 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S20.zip syn24610343 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S21.zip syn24610344 -Data/Gene Expression/Gene 
Expression scRNA seq/counts - scRNAseq syn24171157 S22.zip syn24610345 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S23.zip syn24610346 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S24.zip syn24610347 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S25.zip syn24610348 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S26.zip syn24610349 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S27.zip syn24610350 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S28.zip syn24610351 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S29.zip syn24610352 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S3.zip syn24610323 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S30.zip syn24610353 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S31.zip syn24610354 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S32.zip syn24610355 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S4.zip syn24610324 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S5.zip syn24610325 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S6.zip syn24610327 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S7.zip syn24610328 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S8.zip syn24610329 -Data/Gene Expression/Gene Expression scRNA seq/counts - scRNAseq syn24171157 S9.zip syn24610330 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-ADT-ATCACGAT_S29_L001_I1_001.fastq.gz syn26534563 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-ADT-ATCACGAT_S29_L001_R1_001.fastq.gz 
syn26534582 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-ADT-ATCACGAT_S29_L001_R2_001.fastq.gz syn26534579 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-HTO-ATTACTCG_S30_L001_I1_001.fastq.gz syn26534567 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-HTO-ATTACTCG_S30_L001_R1_001.fastq.gz syn26534585 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-HTO-ATTACTCG_S30_L001_R2_001.fastq.gz syn26534584 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-AGCATCCG_S26_L001_I1_001.fastq.gz syn26534562 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-AGCATCCG_S26_L001_R1_001.fastq.gz syn26534575 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-AGCATCCG_S26_L001_R2_001.fastq.gz syn26534574 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-CCTCATTC_S25_L001_I1_001.fastq.gz syn26534565 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-CCTCATTC_S25_L001_R1_001.fastq.gz syn26534580 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-CCTCATTC_S25_L001_R2_001.fastq.gz syn26534576 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-GTGGCAAT_S27_L001_I1_001.fastq.gz syn26534564 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-GTGGCAAT_S27_L001_R1_001.fastq.gz syn26534578 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-GTGGCAAT_S27_L001_R2_001.fastq.gz syn26534577 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-TAATGGGA_S28_L001_I1_001.fastq.gz syn26534566 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq 
syn26560199 20190110-cDNA-TAATGGGA_S28_L001_R1_001.fastq.gz syn26534573 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - CITEseq syn26560199 20190110-cDNA-TAATGGGA_S28_L001_R2_001.fastq.gz syn26534572 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 268_1970_positive_S4_L004_I1_001.fastq.gz syn26534419 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 268_1970_positive_S4_L004_R1_001.fastq.gz syn26534428 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 268_1970_positive_S4_L004_R2_001.fastq.gz syn26534444 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 271_1971_CD45positive_S1_L001_I1_001.fastq.gz syn26534424 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 271_1971_CD45positive_S1_L001_R1_001.fastq.gz syn26534431 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 271_1971_CD45positive_S1_L001_R2_001.fastq.gz syn26534446 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 286_1976_cells_positive_S3_L003_I1_001.fastq.gz syn26534420 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 286_1976_cells_positive_S3_L003_R1_001.fastq.gz syn26534427 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 286_1976_cells_positive_S3_L003_R2_001.fastq.gz syn26534441 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 289_005_positive_S4_L004_I1_001.fastq.gz syn26534426 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 289_005_positive_S4_L004_R1_001.fastq.gz syn26534430 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 289_005_positive_S4_L004_R2_001.fastq.gz syn26534445 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 293_008_cells_positive_S5_L005_I1_001.fastq.gz syn26534425 -Data/Gene Expression/Gene Expression 
scRNA seq/fastqs - scRNAseq syn26560198 293_008_cells_positive_S5_L005_R1_001.fastq.gz syn26534429 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 293_008_cells_positive_S5_L005_R2_001.fastq.gz syn26534447 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 294_009_cells_positive_S1_L001_I1_001.fastq.gz syn26534423 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 294_009_cells_positive_S1_L001_R1_001.fastq.gz syn26534442 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 294_009_cells_positive_S1_L001_R2_001.fastq.gz syn26534458 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 299_166-cells-positive_S2_L002_I1_001.fastq.gz syn26534422 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 299_166-cells-positive_S2_L002_R1_001.fastq.gz syn26534440 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 299_166-cells-positive_S2_L002_R2_001.fastq.gz syn26534457 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 301_014-cells-positive_S3_L003_I1_001.fastq.gz syn26534421 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 301_014-cells-positive_S3_L003_R1_001.fastq.gz syn26534437 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 301_014-cells-positive_S3_L003_R2_001.fastq.gz syn26534456 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 303_1991-cells-positive_S4_L004_I1_001.fastq.gz syn26534432 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 303_1991-cells-positive_S4_L004_R1_001.fastq.gz syn26534439 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 303_1991-cells-positive_S4_L004_R2_001.fastq.gz syn26534453 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 
305_015-cells-positive_S5_L005_I1_001.fastq.gz syn26534433 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 305_015-cells-positive_S5_L005_R1_001.fastq.gz syn26534452 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 305_015-cells-positive_S5_L005_R2_001.fastq.gz syn26534462 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 325_2013_CD45_positive_S1_L001_I1_001.fastq.gz syn26534434 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 325_2013_CD45_positive_S1_L001_R1_001.fastq.gz syn26534450 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 325_2013_CD45_positive_S1_L001_R2_001.fastq.gz syn26534459 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 339_23_pos_cells_S3_L003_I1_001.fastq.gz syn26534435 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 339_23_pos_cells_S3_L003_R1_001.fastq.gz syn26534451 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 339_23_pos_cells_S3_L003_R2_001.fastq.gz syn26534465 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 345_2017-CD45_pos_S4_L004_I1_001.fastq.gz syn26534436 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 345_2017-CD45_pos_S4_L004_R1_001.fastq.gz syn26534460 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 345_2017-CD45_pos_S4_L004_R2_001.fastq.gz syn26534464 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 346_2019_CD45_pos_S1_L001_I1_001.fastq.gz syn26534438 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 346_2019_CD45_pos_S1_L001_R1_001.fastq.gz syn26534448 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 346_2019_CD45_pos_S1_L001_R2_001.fastq.gz syn26534463 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq 
syn26560198 355_2015_CD45_pos_S4_L004_I1_001.fastq.gz syn26534454 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 355_2015_CD45_pos_S4_L004_R1_001.fastq.gz syn26534449 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 355_2015_CD45_pos_S4_L004_R2_001.fastq.gz syn26534455 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 356_CK030_corticol-CD45-pos_S1_L001_I1_001.fastq.gz syn26534466 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 356_CK030_corticol-CD45-pos_S1_L001_R1_001.fastq.gz syn26534461 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 356_CK030_corticol-CD45-pos_S1_L001_R2_001.fastq.gz syn26534485 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 362_CK031_cortex_CD45-pos_S2_L002_I1_001.fastq.gz syn26534467 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 362_CK031_cortex_CD45-pos_S2_L002_R1_001.fastq.gz syn26534474 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 362_CK031_cortex_CD45-pos_S2_L002_R2_001.fastq.gz syn26534484 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 363_VA_2028-pos_S3_L003_I1_001.fastq.gz syn26534468 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 363_VA_2028-pos_S3_L003_R1_001.fastq.gz syn26534472 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 363_VA_2028-pos_S3_L003_R2_001.fastq.gz syn26534483 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 364_2030-CD45-pos_S4_L004_I1_001.fastq.gz syn26534469 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 364_2030-CD45-pos_S4_L004_R1_001.fastq.gz syn26534471 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 364_2030-CD45-pos_S4_L004_R2_001.fastq.gz syn26534481 -Data/Gene Expression/Gene Expression 
scRNA seq/fastqs - scRNAseq syn26560198 365_CK033_S5_L005_I1_001.fastq.gz syn26534475 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 365_CK033_S5_L005_R1_001.fastq.gz syn26534470 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 365_CK033_S5_L005_R2_001.fastq.gz syn26534487 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 366_1996T_CD45-pos_S6_L006_I1_001.fastq.gz syn26534488 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 366_1996T_CD45-pos_S6_L006_R1_001.fastq.gz syn26534482 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 366_1996T_CD45-pos_S6_L006_R2_001.fastq.gz syn26534486 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 367_2037-CD45-pos_S7_L007_I1_001.fastq.gz syn26534489 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 367_2037-CD45-pos_S7_L007_R1_001.fastq.gz syn26534497 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 367_2037-CD45-pos_S7_L007_R2_001.fastq.gz syn26534507 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 369_182_positive_S8_L008_I1_001.fastq.gz syn26534491 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 369_182_positive_S8_L008_R1_001.fastq.gz syn26534499 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 369_182_positive_S8_L008_R2_001.fastq.gz syn26534508 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 375_VA_2028_pos_S3_L003_I1_001.fastq.gz syn26534490 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 375_VA_2028_pos_S3_L003_R1_001.fastq.gz syn26534495 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 375_VA_2028_pos_S3_L003_R2_001.fastq.gz syn26534506 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 
376_2030_CD45_pos_S4_L004_I1_001.fastq.gz syn26534570 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 376_2030_CD45_pos_S4_L004_R1_001.fastq.gz syn26534581 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 376_2030_CD45_pos_S4_L004_R2_001.fastq.gz syn26534586 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 381_BC_155_Tumor_CD45pos_S1_L001_I1_001.fastq.gz syn26534571 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 381_BC_155_Tumor_CD45pos_S1_L001_R1_001.fastq.gz syn26534583 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 381_BC_155_Tumor_CD45pos_S1_L001_R2_001.fastq.gz syn26534587 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 383_sample_cell_line_553_S3_L003_I1_001.fastq.gz syn26534494 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 383_sample_cell_line_553_S3_L003_R1_001.fastq.gz syn26534502 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 383_sample_cell_line_553_S3_L003_R2_001.fastq.gz syn26534509 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-ACAAGGTA_S54_L002_I1_001.fastq.gz syn26534518 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-ACAAGGTA_S54_L002_R1_001.fastq.gz syn26534540 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-ACAAGGTA_S54_L002_R2_001.fastq.gz syn26534537 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-CGTCCCGT_S55_L002_I1_001.fastq.gz syn26534519 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-CGTCCCGT_S55_L002_R1_001.fastq.gz syn26534538 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 
CK-045-CD45-scRNAseq-3-CGTCCCGT_S55_L002_R2_001.fastq.gz syn26534536 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-GTGGTACC_S52_L002_I1_001.fastq.gz syn26534521 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-GTGGTACC_S52_L002_R1_001.fastq.gz syn26534544 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-GTGGTACC_S52_L002_R2_001.fastq.gz syn26534542 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-TACTATAG_S53_L002_I1_001.fastq.gz syn26534523 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-TACTATAG_S53_L002_R1_001.fastq.gz syn26534546 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CK-045-CD45-scRNAseq-3-TACTATAG_S53_L002_R2_001.fastq.gz syn26534545 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L001_I1_001.fastq.gz syn26534492 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L001_R1_001.fastq.gz syn26534501 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L001_R2_001.fastq.gz syn26534504 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L002_I1_001.fastq.gz syn26534514 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L002_R1_001.fastq.gz syn26534500 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L002_R2_001.fastq.gz syn26534512 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L003_I1_001.fastq.gz syn26534510 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L003_R1_001.fastq.gz syn26534503 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq 
syn26560198 CL402JF_S1_L003_R2_001.fastq.gz syn26534505 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L004_I1_001.fastq.gz syn26534493 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L004_R1_001.fastq.gz syn26534511 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 CL402JF_S1_L004_R2_001.fastq.gz syn26534530 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-ATCTCTGT_S45_L002_I1_001.fastq.gz syn26534516 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-ATCTCTGT_S45_L002_R1_001.fastq.gz syn26534534 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-ATCTCTGT_S45_L002_R2_001.fastq.gz syn26534529 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-CCTAGACC_S44_L002_I1_001.fastq.gz syn26534513 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-CCTAGACC_S44_L002_R1_001.fastq.gz syn26534533 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-CCTAGACC_S44_L002_R2_001.fastq.gz syn26534531 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-GGAGAGAG_S47_L002_I1_001.fastq.gz syn26534515 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-GGAGAGAG_S47_L002_R1_001.fastq.gz syn26534522 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-GGAGAGAG_S47_L002_R2_001.fastq.gz syn26534520 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-TAGCTCTA_S46_L002_I1_001.fastq.gz syn26534517 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-TAGCTCTA_S46_L002_R1_001.fastq.gz syn26534535 
-Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2058-CD45-10x-3-TAGCTCTA_S46_L002_R2_001.fastq.gz syn26534532 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-ATCATGCA_S50_L002_I1_001.fastq.gz syn26534524 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-ATCATGCA_S50_L002_R1_001.fastq.gz syn26534551 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-ATCATGCA_S50_L002_R2_001.fastq.gz syn26534548 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-CCGGGTAT_S51_L002_I1_001.fastq.gz syn26534525 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-CCGGGTAT_S51_L002_R1_001.fastq.gz syn26534549 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-CCGGGTAT_S51_L002_R2_001.fastq.gz syn26534547 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-GGTTCCTC_S49_L002_I1_001.fastq.gz syn26534527 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-GGTTCCTC_S49_L002_R1_001.fastq.gz syn26534556 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-GGTTCCTC_S49_L002_R2_001.fastq.gz syn26534555 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-TAACAAGG_S48_L002_I1_001.fastq.gz syn26534528 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-TAACAAGG_S48_L002_R1_001.fastq.gz syn26534553 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2064-CD45-10x-3-TAACAAGG_S48_L002_R2_001.fastq.gz syn26534550 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-AGGTATTG_S56_L002_I1_001.fastq.gz 
syn26534526 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-AGGTATTG_S56_L002_R1_001.fastq.gz syn26534541 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-AGGTATTG_S56_L002_R2_001.fastq.gz syn26534539 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-CTCCTAGT_S57_L002_I1_001.fastq.gz syn26534543 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-CTCCTAGT_S57_L002_R1_001.fastq.gz syn26534552 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-CTCCTAGT_S57_L002_R2_001.fastq.gz syn26534558 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-GATGCCAA_S59_L002_I1_001.fastq.gz syn26534557 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-GATGCCAA_S59_L002_R1_001.fastq.gz syn26534559 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-GATGCCAA_S59_L002_R2_001.fastq.gz syn26534561 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-TCAAGGCC_S58_L002_I1_001.fastq.gz syn26534560 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-TCAAGGCC_S58_L002_R1_001.fastq.gz syn26534569 -Data/Gene Expression/Gene Expression scRNA seq/fastqs - scRNAseq syn26560198 VA-2065-CD45-10x-3-TCAAGGCC_S58_L002_R2_001.fastq.gz syn26534568 -Data/Metadata syn24168324 HBI_scRNAseq_assay_scrnaSeq_metadata.csv syn24610436 -Data/Metadata syn24168324 HBI_scRNAseq_biospecimen_metadata.csv syn24610438 -Data/Metadata syn24168324 HBI_scRNAseq_individual_metadata.csv syn24610550 From e302a81dce7de007cb1952c353f09f0b9afe7db3 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 10 Jul 2023 17:12:28 +0200 Subject: [PATCH 24/33] Code Cleanup --- 
.snakemake-workflow-catalog.yml | 11 ----------- .template/config/config.yaml.tmpl.tmpl | 1 - .template/workflow/Snakefile.tmpl.tmpl | 10 ---------- 3 files changed, 22 deletions(-) delete mode 100644 .snakemake-workflow-catalog.yml delete mode 100644 .template/config/config.yaml.tmpl.tmpl delete mode 100644 .template/workflow/Snakefile.tmpl.tmpl diff --git a/.snakemake-workflow-catalog.yml b/.snakemake-workflow-catalog.yml deleted file mode 100644 index 3436e32..0000000 --- a/.snakemake-workflow-catalog.yml +++ /dev/null @@ -1,11 +0,0 @@ -# configuration of display in snakemake workflow catalog: https://snakemake.github.io/snakemake-workflow-catalog - -usage: - mandatory-flags: # optional definition of additional flags - desc: # describe your flags here in a few sentences (they will be inserted below the example commands) - flags: # put your flags here - software-stack-deployment: # definition of software deployment method (at least one of conda, singularity, or singularity+conda) - conda: true # whether pipeline works with --use-conda - singularity: false # whether pipeline works with --use-singularity - singularity+conda: false # whether pipeline works with --use-singularity --use-conda - report: true # add this to confirm that the workflow allows to use 'snakemake --report report.zip' to generate a report containing all results and explanations \ No newline at end of file diff --git a/.template/config/config.yaml.tmpl.tmpl b/.template/config/config.yaml.tmpl.tmpl deleted file mode 100644 index 0850e6f..0000000 --- a/.template/config/config.yaml.tmpl.tmpl +++ /dev/null @@ -1 +0,0 @@ -# add template configuration file (may use variables from copier.yml) \ No newline at end of file diff --git a/.template/workflow/Snakefile.tmpl.tmpl b/.template/workflow/Snakefile.tmpl.tmpl deleted file mode 100644 index 65613a9..0000000 --- a/.template/workflow/Snakefile.tmpl.tmpl +++ /dev/null @@ -1,10 +0,0 @@ -configfile: "config/config.yaml" - -module [[ module_name ]]: - 
snakefile: - # TODO replace with desired release - "https://github.com/[[ owner ]]/[[ repo ]]/raw//Snakefile" - config: - config - -use rule * from [[ module_name ]] \ No newline at end of file From be49978748730bec32707d1708445b7352de70c0 Mon Sep 17 00:00:00 2001 From: Saim Date: Fri, 28 Jul 2023 12:21:21 +0200 Subject: [PATCH 25/33] Added Synpase Integration with pipeline --- Snakefile | 4 ++-- config.yaml | 4 ++-- metadata.tsv | 13 +++++++++++++ rules/cellranger.smk | 2 +- rules/download_samples_or_copy.smk | 26 ++++++++++++++++++++------ rules/extract_tags.smk | 2 +- rules/kraken2_mapping.smk | 7 ++++--- scripts/synapse_fetch.py | 15 ++++++++++++++- 8 files changed, 57 insertions(+), 16 deletions(-) create mode 100644 metadata.tsv diff --git a/Snakefile b/Snakefile index 46da13b..6bf420d 100644 --- a/Snakefile +++ b/Snakefile @@ -4,7 +4,7 @@ import pandas as pd configfile: "config.yaml" -accession_list = pd.read_table("SRA.tsv") +accession_list = pd.read_table("metadata.tsv") samples = list(accession_list.Samples.unique()) @@ -21,7 +21,7 @@ rule all: expand("data/{sample}_test.txt",sample=samples), expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples), - expand("results/kraken2/{sample}/{sample}_classified_out.fastq",sample=samples), + #expand("results/kraken2/{sample}/{sample}_classified_out.fastq",sample=samples), expand("results/cellranger/{sample}/{sample}_finished.txt",sample=samples), expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples) diff --git a/config.yaml b/config.yaml index 239b286..ba72732 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,4 @@ -"files" : None +"files" : "synapse" "transcriptome" : "/data/repository/misc/cellranger_references/cellranger/refdata-gex-GRCh38-2020-A" "local_files_dir" : 
"/data/manke/processing/momin/virome-scan/sc-virome-scan/data_pos_control/SRR13114612" -"kraken_db": "/data/repository/kraken2_contaminome/virus_db2023" \ No newline at end of file +"kraken_db": "/data/manke/processing/momin/virome-scan/sc-virome-scan/db/t2tDB/t2tDB/" \ No newline at end of file diff --git a/metadata.tsv b/metadata.tsv new file mode 100644 index 0000000..31ad192 --- /dev/null +++ b/metadata.tsv @@ -0,0 +1,13 @@ +Samples R1 R2 +D17-8765_S1L1 syn18641014 syn18641249 +D17-8765_S1L2 syn18641325 syn18641475 +D17-8765_S1L3 syn18641515 syn18641599 +D17-8765_S1L4 syn18641650 syn18641733 +D17-8766_S2L1 syn18641776 syn18641855 +D17-8766_S2L2 syn18641922 syn18641969 +D17-8766_S2L3 syn18642006 syn18642053 +D17-8766_S2L4 syn18642092 syn18642140 +D17-8767_S3L1 syn18641871 syn18641934 +D17-8767_S3L2 syn18641881 syn18641929 +D17-8767_S3L3 syn18641880 syn18641930 +D17-8767_S3L4 syn18641872 syn18641927 \ No newline at end of file diff --git a/rules/cellranger.smk b/rules/cellranger.smk index 1f6cefa..ec2159e 100644 --- a/rules/cellranger.smk +++ b/rules/cellranger.smk @@ -2,7 +2,7 @@ rule cellranger: input: i1 = expand("data/{sample}_test.txt", sample=samples) output: - o1 = temp("results/cellranger/{sample}/{sample}_finished.txt") + o1 = "results/cellranger/{sample}/{sample}_finished.txt" priority: 90 resources: mem_mb = 26000 diff --git a/rules/download_samples_or_copy.smk b/rules/download_samples_or_copy.smk index 57a0921..7ba9146 100644 --- a/rules/download_samples_or_copy.smk +++ b/rules/download_samples_or_copy.smk @@ -1,22 +1,36 @@ rule download_samples_or_copy: output: o1 = "data/{sample}/{sample}_S1_L001_R1_001.fastq.gz", - o2 = "data/{sample}/{sample}_S1_L001_R2_001.fastq.gz", + o2 = "data/{sample}/{sample}_S1_L001_R2_001.fastq.gz" params: - outdir = "data/{sample}/" + outdir = "data/{sample}/", + filetype = config["files"] threads: 16 + resources: + mem_mb = 20000 log: "results/logs/download_samples_or_copy/{sample}.log" priority: 100 shell: """ - if [ 
{config[files]} == 'local' ]; then + filetype="{params.filetype}" + if [ $filetype == "local" ]; then for file in {wildcards.sample}*; do ln -s {config[local_files_dir]}/$file data/$file done + + elif [ $filetype == "synapse" ]; then + result=$(grep "{wildcards.sample}" "metadata.tsv") + R1=$(echo "$result" | awk '{{print $2}}') + R2=$(echo "$result" | awk '{{print $3}}') + synapse get $R1 --multiThreaded --downloadLocation {params.outdir} + synapse get $R2 --multiThreaded --downloadLocation {params.outdir} + mv data/{wildcards.sample}/*_R1_* data/{wildcards.sample}/{wildcards.sample}_S1_L001_R1_001.fastq.gz + mv data/{wildcards.sample}/*_R2_* data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz + else parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip --tmpdir /data/manke/processing/momin/virome-scan/sc-virome-scan/tmp - mv data/{wildcards.sample}/{wildcards.sample}_1.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R1_001.fastq.gz - mv data/{wildcards.sample}/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz - fi + mv data/{wildcards.sample}/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R1_001.fastq.gz + mv data/{wildcards.sample}/{wildcards.sample}_3.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz + fi """ diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk index 514aedf..47763ee 100644 --- a/rules/extract_tags.smk +++ b/rules/extract_tags.smk @@ -11,6 +11,6 @@ rule extract_tags: p1 = "results/count_matrix/{sample}/", p2 = "results/cellranger/{sample}/{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" log: - "results/count_matrix/{sample}_bam_extract.log" + "results/logs/count_matrix/{sample}_bam_extract.log" shell: "python3 scripts/bam_extract.py -i {input.i1} -k {input.i2} -b {params.p2} -o {params.p1}" \ No newline at end of file diff --git 
a/rules/kraken2_mapping.smk b/rules/kraken2_mapping.smk index 92dd1f2..e3c48b2 100644 --- a/rules/kraken2_mapping.smk +++ b/rules/kraken2_mapping.smk @@ -4,9 +4,11 @@ rule kraken2_mapping: output: o1 = "results/kraken2/{sample}/{sample}.kraken", o2 = "results/kraken2/{sample}/{sample}.report.txt", - o3 = "results/kraken2/{sample}/{sample}_classified_out.fastq", + #o3 = "results/kraken2/{sample}/{sample}_classified_out.fastq", o4 = temp("data/{sample}_test.txt") priority: 95 + resources: + mem_mb = 20000 threads: 16 params: p1 = config["kraken_db"] @@ -14,7 +16,6 @@ rule kraken2_mapping: "results/logs/kraken2/{sample}.kraken.log" shell: """ - kraken2 --use-names --threads {threads} --db {params.p1} --report {output.o2} --output {output.o1} --classified-out {output.o3} {input.i1} 2> {log} - gzip {output.o3} + kraken2 --use-names --threads {threads} --db {params.p1} --report {output.o2} --output {output.o1} {input.i1} 2> {log} touch {output.o4} """ \ No newline at end of file diff --git a/scripts/synapse_fetch.py b/scripts/synapse_fetch.py index fde04b1..e1463d7 100644 --- a/scripts/synapse_fetch.py +++ b/scripts/synapse_fetch.py @@ -91,7 +91,20 @@ # Writing Sample and corresponding Synapse IDs filtered_df = df1[df1['Samplename'].str.contains('R1|R2')] filtered_df = filtered_df[['Samplename', 'SynapseID']] -filtered_df.to_csv(args.output_file_dir + "synapse_fastqs_ids.tsv", +filtered_df['Samples'] = filtered_df['Samplename'].str.replace(r'_R[12].*', '', regex=True) +filtered_df2 = filtered_df[["Samples", "SynapseID"]] +filtered_df2['count'] = filtered_df2.groupby('Samples').cumcount() + 1 +df2_pivoted = filtered_df2.pivot(index='Samples', columns='count', + values='SynapseID') +df2_pivoted.columns = [f"R{i}" for i in df2_pivoted.columns] +df2_pivoted = df2_pivoted.reset_index() +df2_pivoted['Samples'] = df2_pivoted['Samples'].str.replace(r'_L(\d+)', r'L\1', + regex=True) +df2_pivoted['Samples'] = df2_pivoted['Samples'].str.replace(r'S1_', 'S1', + regex=True) 
+df2_pivoted['Samples'] = df2_pivoted['Samples'].str.replace(r'L0*(\d+)', r'L\1', + regex=True) +df2_pivoted.to_csv(args.output_file_dir + "metadata.tsv", sep="\t", index=False) print("\nRetrieving Data Successfull...") From 3230f7154e58cfb5d42971a7dd37d7e63e529751 Mon Sep 17 00:00:00 2001 From: Saim Date: Fri, 28 Jul 2023 12:29:50 +0200 Subject: [PATCH 26/33] Added Synapse requirement in env.yaml --- env.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/env.yaml b/env.yaml index ba93358..1eb9404 100644 --- a/env.yaml +++ b/env.yaml @@ -9,4 +9,6 @@ dependencies: - entrez-direct - parallel-fastq-dump - pandas - - snakemake \ No newline at end of file + - snakemake + - pip: + - synapseclient \ No newline at end of file From 8b3d139562dc45d84d91ce57747ed354e4f6b499 Mon Sep 17 00:00:00 2001 From: Saim Momin <64724322+SaimMomin12@users.noreply.github.com> Date: Fri, 28 Jul 2023 12:46:28 +0200 Subject: [PATCH 27/33] Update README.md --- README.md | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d77a4c7..e3ef86d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # sc-Virome-Scan: A Snakemake pipeline for detection of viruses in single-cell datasets. [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) -[![GitHub actions status](https://github.com/maxplanck-ie/sc-virome-scan/workflows/Flake Check/badge.svg?branch=dev)](https://github.com/maxplanck-ie/sc-virome-scan/actions?query=branch%3Amain+workflow%3ATests) + A method wrapped around Snakemake for swift, precise, and accurate detection of viral pathogens in single-cell @@ -9,6 +9,28 @@ RNA (scRNA) datasets to investigate the possible correlation between viral patho
+# Installation + +`git clone https://github.com/maxplanck-ie/sc-virome-scan.git` + +`cd sc-virome-scan` + +`mamba create -f env.yaml` + +## Usage + +> ***snakemake --cores 16 --configfile config.yaml --latency-wait 60 --profile mpislurm*** + +## Handling Multiple Input modes to pipeline +Currently, the pipeline supports Sequence Read Archive (SRA) and Synapse AD Portal input files for analysis. + +
+ +## Important Note For Synapse Data Analysis +In order to download and analyse data from Synapse Portal, make sure you have the `synpaseConfig` file located in `~/.synapseConfig` directory. This file has individual Username and Access Token in order to login in to Synapse programmatically (Automatically taken care by the pipeline) and download the data based on the user input. + +Additionally, the user needs to provide a ***metadata.tsv*** file (present in the base directory of the repository) which consists of Sample names along with its Read 1 and Read 2 Synapse IDs. The pipeline will use this metadata.tsv file for downloading and analysis steps downstream. + # DAG for the pipeline ![Graphviz Diagram](dag.png) @@ -16,10 +38,6 @@ RNA (scRNA) datasets to investigate the possible correlation between viral patho
-## Usage - -> ***snakemake --cores 16 --use-conda --configfile config.yaml --latency-wait 60*** - ### Note This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. From 31d46ffa3260e3a18a183319537bbe234d696a38 Mon Sep 17 00:00:00 2001 From: Saim Momin <64724322+SaimMomin12@users.noreply.github.com> Date: Fri, 28 Jul 2023 12:54:09 +0200 Subject: [PATCH 28/33] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e3ef86d..fd7ee1d 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@ RNA (scRNA) datasets to investigate the possible correlation between viral patho # Installation -`git clone https://github.com/maxplanck-ie/sc-virome-scan.git` +`git clone -b dev https://github.com/maxplanck-ie/sc-virome-scan.git` `cd sc-virome-scan` -`mamba create -f env.yaml` +`mamba create -f env.yaml -n sc-virome-scan` ## Usage From 89ac587ce6e4d8b08fac6a00d9d7649547d17695 Mon Sep 17 00:00:00 2001 From: Saim Date: Wed, 9 Aug 2023 15:08:27 +0200 Subject: [PATCH 29/33] Updated Repo with minor Changes --- README.md | 187 ++++++++++++++++++++++++++--- Snakefile | 25 ++-- config.yaml | 11 +- dag.png | Bin 23890 -> 24447 bytes env.yaml | 1 + rules/cellranger.smk | 19 +-- rules/download_samples_or_copy.smk | 9 +- rules/extract_bam.smk | 6 +- rules/extract_tags.smk | 8 +- rules/kraken2_mapping.smk | 9 +- rules/report.smk | 20 +++ scripts/synapse_fetch.py | 2 +- 12 files changed, 236 insertions(+), 61 deletions(-) create mode 100644 rules/report.smk diff --git a/README.md b/README.md index fd7ee1d..365c601 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,151 @@ -# sc-Virome-Scan: A Snakemake pipeline for detection of viruses in single-cell datasets. +# scVirusScan: A method for swift and accurate detection of viral pathogens in single-cell RNA datasets. 
+ [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) +A method wrapped around Snakemake enabling accurate, sensitive and scalable detection of viral pathogens in single-cell +RNA datasets. The pipeline integrates the strengths of two standard approaches, a standard mapping based approach and a **Kraken2** k-mer based approach which provides rapid taxonomic classification. The output of the scVirusScan pipeline can be +integrated easily into existing single cell analysis frameworks (Seurat and Scanpy) which can provide a +standardized and reliable way to scrutinize virus infections at the single cell level resolution. + -A method wrapped around Snakemake for swift, precise, and accurate detection of viral pathogens in single-cell -RNA (scRNA) datasets to investigate the possible correlation between viral pathogens and neurodegenerative diseases.
-# Installation +# Installation and Setup +1. Clone the Git Repository using: `git clone https://github.com/maxplanck-ie/sc-virome-scan.git` + +2. Once cloned, change the directory to sc-virome-scan: `cd sc-virome-scan` -`git clone -b dev https://github.com/maxplanck-ie/sc-virome-scan.git` +3. To install the dependencies of the pipeline an `env.yaml` file has been provided. The environment and dependencies can be installed using conda/mamba. -`cd sc-virome-scan` + `mamba env create -f env.yaml -n sc-virome-scan` -`mamba create -f env.yaml -n sc-virome-scan` +4. Additionally, this pipeline needs the `CellRanger` tool. To install CellRanger please refer to https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/installation -## Usage -> ***snakemake --cores 16 --configfile config.yaml --latency-wait 60 --profile mpislurm*** +5. After installation of CellRanger, the path to the CellRanger executable needs to be specified in the `config.yaml` file. + +6. Finally, the `config.yaml` needs to be modified as per your system environment variables. More information about `config.yaml` along with its description can be found in the section below. + +
+ +### **A. Contents of `config.yaml` file** +``` +samplesheet : # Path to your samplesheet file. More details about Samplesheet schema can be found below + +mode : # Pipeline mode to execute. (synapse | sra | local) + +local_files_dir : # If you choose local as pipeline mode, specify here the directory path of the files. + +kraken_db: # Path to Custom KrakenDB. More details can be found below + +cellranger: # Path of CellRanger Executable. + +transcriptome : # Path to your human transcriptome required by CellRanger count + +scripts_dir: # Path to scripts directory present in base directory of the workflow +``` +
+ +### **B. Description about `config.yaml` file** +1. `samplesheet`: The pipeline requires a `samplesheet.tsv` file to carry out the analysis. The samplesheet is a tab-separated file (TSV) that provides a blueprint for analysis. As the pipeline supports several input modes for analysis, the samplesheet format depends on the selected mode, as discussed below. This file is further used to locate or download the input files and perform analysis on them. + + **i. SRA mode:** + For running the pipeline in SRA mode, the user needs to provide a list of SRA IDs as shown in the example below. This file is further used to download the files from the SRA database and perform analysis on them. + ``` + Samples + SRR13419001 + SRR13419002 + SRR13419003 + SRR13419004 + SRR13419005 + ``` +
+ + **ii. Synapse mode:** + For running the pipeline in synapse mode, the user needs to generate the samplesheet with the help of the `synapse_fetch.py` script present in the `scripts` directory of the pipeline. This script takes a Parent SynapseID as input and programmatically queries the Synapse Server to retrieve all the associated FASTQ files under the provided SynapseID and finally returns a tab-separated file consisting of SampleName, Read1 SynapseID, Read2 SynapseID as shown below. This file is further used to download the files from Synapse and perform analysis on them. + ``` + Samples R1 R2 + D17-8765_S1L1 syn18641014 syn18641249 + D17-8765_S1L2 syn18641325 syn18641475 + D17-8765_S1L3 syn18641515 syn18641599 + D17-8765_S1L4 syn18641650 syn18641733 + D17-8766_S2L1 syn18641776 syn18641855 + ``` + + **iii. Local mode:** + For running the pipeline in local mode (i.e. files are already downloaded prior to analysis), the user needs to specify the Sample names in the `samplesheet.tsv` file. Along with this, the user has to provide the path to the directory where the files are present in the `config.yaml` file under the `local_files_dir` key. + +
+2. `mode` **(synapse | sra | local):** + At present, the pipeline caters to three distinct modes depending on the input data type: Sequence Read Archive (`sra`), Synapse AD Portal (`synapse`) and files already present on disk (`local`). Depending on the input data type, the mode can be modified in the `config.yaml` file. + +
+ +3. `kraken_db:` As this pipeline relies on Kraken2 for rapid taxonomic classification and requires a KrakenDB in backend, we have pre-built KrakenDB with curated list of Virus families. The custom VirusDB can be found in the link below. Once downloaded, please specify the path to KrakenDB in `config.yaml` file. + +
+4. `cellranger:` Specify the path of CellRanger executable. + +
+ +5. `transcriptome:` The CellRanger count requires a Human reference transcriptome for scRNA-seq analysis. This reference + transcriptome can either be built using `cellranger mkref` or be downloaded pre-built from our Zenodo data repository. Link can be found below. To build the CellRanger reference transcriptome manually, please refer to https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references + + Once the transcriptome is downloaded/built, please specify its path in the `config.yaml` file corresponding to the `transcriptome` key. + +
+ +6. `scripts_dir:` This path refers to the scripts directory present in the base directory of the workflow. + + +
+ +## **Important Note For Synapse Data Analysis mode** +In order to download and analyse data from Synapse Portal, the user needs a `.synapseConfig` file located in the home directory (`~/.synapseConfig`). This file holds the individual Username and Access Token needed to log in to Synapse programmatically (automatically taken care of by the pipeline) and download the relevant data based on the user input. More information on setting up the `.synapseConfig` file can be found here https://python-docs.synapse.org/build/html/Credentials.html#use-synapseconfig + +Steps to set up the `.synapseConfig` file +1. Check in the home directory if `.synapseConfig` exists. +2. If not, you can download the template from https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/develop/synapseclient/.synapseConfig +3. Once downloaded, the user needs to update the fields of username and authtoken. An example is given below: + + ``` + ########################### + # Login Credentials # + ########################### + + ## Used for logging in to Synapse + ## Alternatively, you can use rememberMe=True in synapseclient.login or login subcommand of the commandline client. + [authentication] + username = YOUR_SYNAPSE_USERNAME + authtoken = YOUR_SYNAPSE_AUTHENTICATION_TOKEN + ``` + +4. The Authentication Token can be generated from your Synapse User Account. +5. After the changes mentioned above, the `.synapseConfig` file is ready to be used. + +
+ +# Pipeline Execution + +Once all the configuration dependencies are met and parameters are set in the `config.yaml`, follow the instructions below to execute the pipeline + +1. Activate the conda environment: `conda activate sc-virome-scan` + +2. Once the conda environment is activated, trigger the pipeline using the following command: + + > ***snakemake --cores 16 --configfile config.yaml --latency-wait 60 --profile PROFILE_NAME*** + + --cores: Cores to be specified for the pipeline (Min: 16) + --configfile: Path to the `config.yaml` file + --profile: If slurm profile available, specify the slurm profile name -## Handling Multiple Input modes to pipeline -Currently, the pipeline supports Sequence Read Archive (SRA) and Synapse AD Portal input files for analysis.
-## Important Note For Synapse Data Analysis -In order to download and analyse data from Synapse Portal, make sure you have the `synpaseConfig` file located in `~/.synapseConfig` directory. This file has individual Username and Access Token in order to login in to Synapse programmatically (Automatically taken care by the pipeline) and download the data based on the user input. -Additionally, the user needs to provide a ***metadata.tsv*** file (present in the base directory of the repository) which consists of Sample names along with its Read 1 and Read 2 Synapse IDs. The pipeline will use this metadata.tsv file for downloading and analysis steps downstream. # DAG for the pipeline ![Graphviz Diagram](dag.png) @@ -38,13 +154,44 @@ Additionally, the user needs to provide a ***metadata.tsv*** file (present in th
-### Note -This is a developmental and alpha phase of the pipeline, upon completion a Python Wrapper will take care of every runtime parameter handling automatically. - +# Pipeline Output +``` +├── results +│ ├── kraken2 #Kraken Classification Reports +│ │ ├── Sample1.kraken +│ │ └── Sample1.report.txt +│ │ +│ ├── cellranger #CellRanger scRNAseq Analysis Intermediate Files +│ │ └── Sample1 +│ │ ├── filtered_feature_bc_matrix +│ │ ├── raw_feature_bc_matrix +│ │ ├── possorted_genome_bam +│ │ ├── possorted_genome_bam.bai +│ │ ├── filtered_feature_bc_matrix.h5 +│ │ └── raw_feature_bc_matrix.h5 +│ │ +│ ├── count_matrix #Final Count matrix result +│ │ └── Sample1/ +│ │ └── count_matrix.tsv +│ │ +│ └── kraken_reports #QC Reports and Plots based on Kraken2 Reports +│ ├── Familywise_tax_readcounts.tsv +│ ├── Specieswise_tax_readcounts.tsv +│ ├── Clustermap_Familywise_log10.png +│ └── Clustermap_Specieswise_log10.png +└── logs + +```
-## Contributing -The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=maxplanck-ie%2Fsc-virome-scan). +# Results +The user can use the `count_matrix` file from the results directory to integrate the downstream analysis using Seurat and ScanPy. +Intermediate CellRanger barcodes can also be found in the sample-wise directories under the cellranger directory. +
-If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). +# Note +The sc-VirusScan pipeline is under active development. Please open an issue on the GitHub repository for feature requests or bug reports. +
+# Credits +sc-VirusScan was developed at Max Planck Institute of Immunobiology and Epigenetics, Freiburg. \ No newline at end of file diff --git a/Snakefile b/Snakefile index 6bf420d..e15be4e 100644 --- a/Snakefile +++ b/Snakefile @@ -4,8 +4,8 @@ import pandas as pd configfile: "config.yaml" -accession_list = pd.read_table("metadata.tsv") -samples = list(accession_list.Samples.unique()) +list_of_samples = pd.read_table(config["samplesheet"]) +samples = list(list_of_samples.Samples.unique()) include: "rules/download_samples_or_copy.smk" @@ -13,22 +13,25 @@ include: "rules/kraken2_mapping.smk" include: "rules/cellranger.smk" include: "rules/extract_bam.smk" include: "rules/extract_tags.smk" +include: "rules/report.smk" rule all: input: expand("data/{sample}/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), expand("data/{sample}/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), - expand("data/{sample}_test.txt",sample=samples), - expand("results/kraken2/{sample}/{sample}.kraken",sample=samples), - expand("results/kraken2/{sample}/{sample}.report.txt",sample=samples), - #expand("results/kraken2/{sample}/{sample}_classified_out.fastq",sample=samples), - expand("results/cellranger/{sample}/{sample}_finished.txt",sample=samples), + expand("results/kraken2/{sample}.kraken",sample=samples), + expand("results/kraken2/{sample}.report.txt",sample=samples), + directory(expand("results/cellranger/{sample}/{sample}/",sample=samples)), + expand("results/cellranger/{sample}/possorted_genome_bam.bam", sample=samples), expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), - expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples) + expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples), + "results/kraken_plots/Familywise_tax_readcounts.tsv", + "results/kraken_plots/Specieswise_tax_readcounts.tsv", + "results/kraken_plots/Clustermap_Familywise_log10.png", + "results/kraken_plots/Clustermap_Specieswise_log10.png" onsuccess: - 
print("Snakemake finished successfully!") + print("sc-VirusScan Pipeline finished successfully!") onerror: - print("Snakemake has failed!") - + print("sc-VirusScan Pipeline has failed!") \ No newline at end of file diff --git a/config.yaml b/config.yaml index ba72732..89116d8 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,7 @@ -"files" : "synapse" -"transcriptome" : "/data/repository/misc/cellranger_references/cellranger/refdata-gex-GRCh38-2020-A" -"local_files_dir" : "/data/manke/processing/momin/virome-scan/sc-virome-scan/data_pos_control/SRR13114612" -"kraken_db": "/data/manke/processing/momin/virome-scan/sc-virome-scan/db/t2tDB/t2tDB/" \ No newline at end of file +"samplesheet" : +"mode" : "synapse" +"local_files_dir" : +"kraken_db": +"cellranger": +"transcriptome" : +"scripts_dir": \ No newline at end of file diff --git a/dag.png b/dag.png index 276925b149664d477af910bd00c25d861bcc5211..0b129992c2e72ad7a5a71700e17902e495a48b4e 100644 GIT binary patch literal 24447 zcmceeWl&sEx2DlRaCdiif;$9v2o50x57xN5TjK-|?(QxjG!O_*f(M%5?mma_-a9pa zW_}J;DN@~icJH(8toK>(B3eUD9vy`Q1qKENT~R?+3;5ju0|V!a{08`69}d?T@C(*e zOI{kLc9Lul28IeoQC3RF%k(H0F_lyRcZBh$3Za#i)t6Z=UcMYo+sv;O<~H@g?|%M7 z=LLR-!vFT(WOUX3I?G+W4)GJXFg06l^57$R^5B~}R-%4we8CCSjG%3X);=LvpX#6d z2&az7GPQz74?CWh&|704e~XM;?RKJ3CWAlNn|)@cOA5W7iO(DB&fF=QD>>-H>n^L= z2VLaXQY*&j;o`F8j3fU9*>L$SH4jv;ptofAmz(L!SIkARnpez6+V835(cMb?PX+uw zat4t-M{t^8MUYLxq1|{UGRV5famkjJi;g0ifi&60f(kCK*EF$>{Z7c6D&KFILrgCD zXikelwVwvaS(V-Gb=)<{2^R<870mMT4c=pKTGme*Ipc<*kt(V>kd0z9ADyZQy5*4|xe}9$!t|efA(j1WK}Vba*e6sKOs#iriwaGTW{6#(s5kMK&KH6e^OELZ z_&lnAok9bXO#QqB3fgsFd1m9w;svDlFH2;8R2dy9zu)~5 zE9ic*d9Y%VIqons2$iMKU?|YXP&3)$G#U?4rz3Q_?&+pCJ=RqMT7mR#B)u^ca99ByK+R{quSgA0G&+&gzkUG8e$KR=1IIIXWq#y;wm;6tZyIZZ`a{h25gw$CJp3i;Fk= z-Z;kKDb7DEX5IHQK~V8oWc}|iT|WDse)VDjC+Y`eJNCv%ig+HQ)$>~a%2UNV0gLD9&}1{ zeZDsw^h&7^Lxh)nW^IQ3rIsrEkns#o?d*3BHtW6qgS~IiNv2N!4l%p!PBnwJmG);! 
zP*>YsWOg(8?Fsnp7vfr-*42O(3V!8FXgAwT?*dJKf=YI==vK-ys^t`ri+Dtle%bsU zty8JP?9w9!RT$xFtTt?k>Giv#dE~Pk!*ijPcDA_QpI5$R(ka)_X|~b)#witou<0ow zAu)|%9J(&zwoA1%0LG2hvp-vf=G*_8t={&veR+OLrIAC4?Eje~l!C*kreZr&Of5N6 zsmoev)K=@<3wM|+;%PHgAf;3PQR?jEV&|83j-ZPyl57Im*Zuh_`FK)+(q8`u!#Y@F z(sy49Xyjt18ZE~Wj2CN7io)LFKEnt(=0%{B&dgP4Gs`ECZAJ{DBY;6WlTV?muu=&j zs@d;Knyi1tkRC3#InjtI=Zp6cO&qUw$;XrM<2W2H)=`RipUVct^bK(|o3D1d&$T)j z?XUOsof`dVwA>qxC4}RV3^-U|cX7qzFwV64nJLxdd&6+cC>@3@A-Ptpi2Ds*JR;w) z&5>P;xf!+5Vfmdo{AIxXrK0i6^^wt1qh;(ZIV|$Wt#uptFKg{CwC;ARnM?DmG-Dg{ zMx<}WlZO8=Eusk95`Xvfrrl*LhMJ-R6?)S5QtGnxWBMzvb@d`l)?}`Tk?O$JuRO6L zxmbcH81FbDZe)$$${nC>vRWO4H+Q?!MZ>bBVE9`^R04e`{oN6!Fw)brt&u|Vs7Hkf zE)?p0S_he~jo7g?R_$OoMA_yKe!(`0QPs#ZC3~}Fu~AN_cfhB^Pw+mbofE+VTarzy zKN6EN#_eqq`8AbTaE^$l8UlT>91_B2y->S z>)X-QM>E14t@U(8_1ewg$AJ{6xz4H3hg#ad{l$&*Ifd_c#Xs+#Nt-QGkL~1w9sW!x zfQwLM2OJ#L;N`?Zv%UcQlo>02WiBWRRQsKHA=t$=>Mur$vPu>>i^_+pMN)on8rb$_ zz)=*19Hoo>xT+0-#bod$`hm^va$(#TU?MC2kVA$m1f9&B`!oq0a!5(0=BWeq6dH7T z=~&#-A4}kDV~%hNo1)7K2{)*Eq#WZf5tuQwnnk_Oqtfu@4xMiMX@mk~Sil<*obwY@CWUt}{x?4v1zF6+TEczR% z?HkZke!zY78%zqef=C!Hwhr?SG#LgOk`Jn);bi1*PQlOL6DiHy=pwA6is)sbdF{*` zrrk{GM=*E=BYp58WPzC2G+~*=z(a5H?-128y{rljY+Ykm`R;Ihm@a-Xf%#Abeigl9 z3Il$?K>QX5H1l@L8^JG*UKW8A`0!6muqpMG%zo7MCUq{HPzd&i4)Dd0hsx#O{1saZ zZm3mX*Cx|ug`!M*oG|7`cp5)e7|nNTPd8&5O<@plRVY1j`X@;IzD@%4z4v{jPY5RT zcP|WOcSJNA5DzfFI<(pRix2Nkg~I&&u^T7mPKD9> zj4CeFaYW$qJ933l;-m{lN5iv`*YGr$jNHfGLg4=Lymi+|%^(D2GreZh)NGwz{If@W zpXF)DiU*-i!ape_5F`6rQ}7V+SU{BDY1)lU?UyG)5!b7J4@QgtFe_&Gr&8AO<#)b9 zJAzYaPthh#&UOkgJAQ8@AV&}QoY(E%$dwtio2&npOY_v!(B7_rS-UtY1dT_#qLDu> zEK@4)LBIUkKozorw!L#}qNqif+WPmUpE(XH5GJ<4lIM+J2AR&dj$}3eJ#;Tr#(AFX zh}wbDKMmeM?||!kO4{;@jhD)RcdIVw?(UAJ`9AUS2)8>feD< zWsc1Q4)`c3`Wi7!PyE}>aceM!$k26x?shZ6LrCk@pI(O@TYuO+o=$(8$jz#tFM~qr zqhs-IJLp;wq6J(ZA8M69PK~16#f)B5TkR;?|5?QPB_!Uh+v0D=rXHkoY8}e`IW!}44fv8U#G|QA(BeT= z;7Zqe#q29ATy?T299b>*!>#wrlhg7&+>E8$!^fOW)^2u`z{tcTm0N>Q&llqJeyW>) z>vrj;U-$#}{w=Mo;Hf)UL&UBMsuvT<>N66^J2`(^K%M~glVpkOkWRG)8?myKMm8FX 
zHt&IM4k11-7;9}YK@*ap_(FwA`VMcx%ziD{3`a0)7XPiV6d`V^!X)^ji?tJmAyg|O zXBMAWaBluhL=N>LJ{NH+FBGi)xYTk(LywdQ zXiKmEv&JNFDUWk`6*|yXE$c@fCTrl*|93aMl~Mk0{peca0XkyUeEw%vZR&`$?orqG z`6uqdi}g>O8>E!`L^kv0uU7LTA~s*Qz1a^|>-Tvk?k^O~KSR=NQdd5_TJ=!ah@tdjTJDxV4C~M9bjh zV#pQ@rVGwRWGHleK#L36e3n1W%O;o&d4yVya~5qosC0s;FYrTyvakZ4R&IizTGxyjaDDtb~o0F{AJ^vjyaHsb(t|aLz43 zd=9nKKju`PexFr1jjKlGv_?N{9Wu0xzyFH-f}s6Zp;mW9eeDIDuy!@Clsn=h&L*K8 z(3>Bghdj@#jjTwq6^TinT&k%G&-j;997eCDv6AyvrEOn~wW8R&B$xGld7nJ0x%GU047@7UQ z#_qUbaCStg2mUy6heyqYF)kq2FjW`(!ANU}XdQb|K8`TJYV}o1f|9&0PLQmaKq9bO z#mW>wuZxpIs4y>k1X!(k+H@VSdnAKQ^Vft0uv&jHCmX-+5#@yPoqycGLf^qttbN@h z^zw7A7eHGpt}%3rRPfN=-d;R2yd%k!wp=6JfHC0ZtUOWLUK)h6dw#U?t zEFrm_3eRwPlD#(5Glx8}$wO5`m@l=J4x|22Kzc*F0hO7?uy!3TKW)3q=05Kr5+0dy*p2)EW8aWdxn5cZYwPDlEs-U-eg|K~rfk5ERnqM(9y3{W?bD0t#BYr_z4y%`96-? z4(|;}l?s_ij4%a92`GGfm0sPt9B%C*30Ii&-RjeTO8kbFdlbP-EJYfQ)uG=|OQ3;Sx zy7aY_TpwX{t3IeongY5li_1LZ_UYl~V3tq{D$V+Oqb_6o90yVN@r;?L{NziAqJp74MJ8 zmWqbH%0)os#Oi|MhmcD&H2pyiX$ZSL{KLy)+T9N0aj;NBBM>17hrtC%>`W|gYD$TK;@h(=Z9o)1 zNhMsG4}>7ntp6zHw;W6TG>XZLI8)?vwcmj*^yRzkMP1+Xxqzm}f*4u{FQ)gv$+2`o zsaU{cW()y43zcjtpnnQwAPs|XkXL2@cQCw2EmwFtm026%vNg%rn<%W0y3m6E4K}g2 zuogLV#rxtP{uPH&Yn)-Tjg;+dsk?re1D(91X{VcQ(&XdPgC%(;W#vtvy1BT(#Y~40f{|O3`0|?z z`!A)amn6VaKH&}6DIflB$dz6zlE)qrj7{&*W9pQ4>H_3$3J7hdF2?qrAqjEEC~P~srmo@ktBAg z=Agl~@R{)4buN>R!s^d}hdt8aD6A~t;ZR{jj)C_Oae8s#Dn}JK*<0Bg*_^ROWzqbppWxZxF5fn~ z$OUEalq{G}B<@bcIO$^?b2~HvpA#-PWtw$8;!NawE6vT{wvp+SYLR?3*@|&M>tJ2u zX*|4QnCIA9DMh^X*quZJR>mxq%+RNW-2q^c;W9P;)BOpsl` z#<88T3~(npQ}w+eJIZ8kK=MLeH=9n&Nm4!vfpTTIA+I|Z)XHbQ*O3kX( zRcZ-Bbeunt|K20-{-Sp_7rWVCU^3bk>%IbEqg1c&fohiMzk03;=jhLKtwtNx7WERE|!T+-)m?OkxOsDlbSaQLm`Jhs(0ng?g25VNG#$0{pGsM z2&M#~Uwk zs++}s*Ur`Cp3$vyxdUy?{$ful|@&c_h*;XgNvn%LdoD_T)^)EUEGK z4jZ2(FLlZN*GVG=F6T!`yxa31?L=E=%<*T2J>KDO!A0?R)vB@YFbzb^|D@m8$KBT(qIN|`Q7=N0)aJ=V-16UJ z{AL~7{t4_M(no`AZTMWg`lt3zr6V+*GED+?T@rUO8OPMJvw(Nk?}Fp946~9M5*>Bs zv*tq+h4VzuOPYy4&2BVE*q7>A%b|5lVi4>%WW1$#mPEl*l_vKg;3_nnIH+IEn2v2t 
zwt(<64yaL60MxfLxL!JV$_%654|HA#=wR^k4>r9Jzw=KcDxq3#!q|VI0_+N`&tZ?jOYTS|=|Bj43 zA?XCTYquUIHecc*?2X4VH6UF$aptS@6|0$9DH$VC$def+IGmP7Mh#nVALvaxZsR@; zM=NOW=*}tCO!+LJ zsWYi;;g`5J5yq^$=@Sz2wV&mXlnZ(FVX1c>&@Ec@#dw2lvA8cmm<<|{Y9$IF^L5CUSW$CVS;uQ5{n zhH8l6QMzJ=r0Jm64(Wjt0`A%q49xiJ7V%?VuTm?GPtJ{ms}TWEm>+b#iBU~DZgcM4 zR#;T*mxLM9DybIRDm=t3AT2=#O0L*aP2b?jEQ$q?4=EasMnv9^no=f%virIgMFmLKv>1HmQ(oqVbo*mTSrx; z_ARn-4qlDi&S_Qak{5S<8a~WgXVctJhml&ne_!`Y)MhGoRkN9(3!P_GC$&ytZ?Eg>vM_H`LQ|hl!EOj+Loyf-O4Xu;mivxNWWT$`nS!V zsxLa7rb7f$7x}kvl`&UyCGkAxHEmx`!QW?4qZ_vBWHACvJk;OdKpuj$)}w$a5qMdC zHn6wJskc76eCAoBns8Z#89q)pQDuU~Aghp-Ie1602ONBbO^lS$t!idE1`AL6WHJGN zd$uQp2zDq=moJlTE|fL7WLE`cy?gGln5Prk%?B$&{MO>Mn;nPgUC`gb50@6;DD{Ko zQSl@E1|w3YtU6SDx}mT=;-|^UZi-TWapl3|f`m19|90dT#e22UJC4MoCzhavW@E}7 z>Yp~i>gs8$asr5bn_drCxGGNmn8U3jKI`HqIM5w6u~Mm*W7<$X$Fu#KutBiz60fxI1RI+26EQfrSApLkAug5vs(RuVnBg0Cp>WohS388Y#w235*-)`5!NUxq1Y{z@G<`j2khyU%#Wz_T0Xl7Q+tD&qpnZj4;$5l^3)a&cT?88r| z;janRZ!!<-7)J9w-g*rw|{Tz6*b&4+fckCszebXiCQovCzd?7X*Uh@n9Gs1Qge z#^70&!$xfB*F>QL9vi+3~qTyG-jx9C3y3&GAB`-C;O6fC@^HFex-< ziWP^i>gBPGAiyge@28oyOZ5ZS`vUuj5JJch8}Ef>GjT8_uzZz*6XKOo)#3fXpVrbMUlv!c(-vlx&> zSBN8OhZXm~r;`kLxUS(xhv5To$hTKD?xXnzi_d0CGENW-XxnK3*w_LI@XeRy7JGT% z-ilYdJW(OvDEANxYK=P@gHSzQP6A(WH;OPGd$dZF`0N+hU?riD1obbWYPTEzCbAI% zshoP9|At6Fa{q{Uoqn7SSozX^=Zq_(HxQCuZi6cHMaCbKQoK3AC+HqXGk&7moB9S1 zYZ`=}4df149Z}?;lGu&jz3TJ!Hvxy;j>O?C;Bk70gTTwrtXcV-1-7q(5q>8FGR-|7 za9tv;e^Hz!V--3AnNm5ZMlmq0n-bXm^Rl4bd~S>PK>-8h!I#KMz@jLJ>j!pMj9NVL zY$NA2kfVA&$RM>AO*IjnD3=-ect0BhKgMFyD^!%fA0djyiC z3-(wIW$O7Hg;X<%uc@_fhNCCgB>-rW1P~H7m{zIIz~ax-=m#K2Pxm221L+Vn_ip)| zQrr(~n(#;`1onL>FfOx{fsf|lqFlb3$OW)UDj&JCq`>J^JTix_pxd1v@uaWj1pvw6 zx7bWe$GsR0I02k#{Px_GGEFX^)4zx3FDXz^qZ5KX35;f@lRvPm6oh+keK6&pi(m3TmP45 z6PU^sXQhNHXns2u*kTwah*Hca^)-Rb1XVzvDN}bsoL+O3cNgrOCgdexc)c#-^0UQL~>zCLB=HoCEBoeG~9j!*lNs z2d3wp6PjPGN3wiMU%lXF`e)GjlU67|vhUquc)|*w{qD{yB8*i#LloN0hu&}t|87G+ z(nj?LHmIE4N8gl9pZ=I^451!gh$hC-@I(bNjdQ#6U$`)`ZLkF`{{WN-SDjw0v5%Nr 
zUC95wIdfLfh*2HPk30NafsF4kJ4g?48GFM}V47A71L`xnu`$ZEN@Y+b;U*r!p_xIF z<*cs$I1D#L|A8GapwJZxMVtV3ju_EafU;H2;QK;i@tA1}@fKWR*)9uR=iYL>=&vRkl6Io1|H$};LhFo!L%_nA)gArU=qzB6>Fa+@#?ON=k0%{ z1Ir0rQ$e~_8rK72%<_L`IHVG^vs?_D7Nrshn|;FK7-+(ehNwinhn!s+rX+1A%fUke=!?@mDUDME8u9mX0Ia@?+~{oTgv)Pn=+2{%xU_`C zBOp(|RZf5*9S_n^hF^gU^_XAFo1&z0rUgf8fdkJlPi$*3?F zK-=)sv30WM(pcMW-D1WX*Xk==e{}+DUiM}M?pu;Aaf=`2k~Kz55kI@eZ#is=2fT1( ze~Vah?HZxKzduI}S0+fpxtcq^;E^HyYi`-c*mBspt+LklA{}2@i!~hcoReedIeKu2 z1Du@Lpdxl~L$BJ)vck=A_qVXXHqj7&KN8gd<~1w;7|X~Vhe)w`=$mO~r%1;*dwSCysI zz@l{&QGDfnqI1yqHhAQ7H~|Cdku z2v2-rO{|o|m-U65F<$Jq5UPlcl%dB@8G5A>@Q>C&Anisb9Zg$e4en{YDxbbS z{OuQ78ZSXpIXwQwt3s6r7%O+kgQr=!CjaQ*>v zxB=ZHoho+6JJNVm=$nZ*k1jD6l}BFoXgMdxE1y46;;;|Z;Mydwbr3rKD_BO8U^$6wY{5V>Ji@J2mbl)CO6 z{PFNRh%#Bs({Eo$e{7Kw7#1oC?@QRHwZ(Jpekbw^4VSxKX#p|2``#xNRl90D4?L!< zFyYHoM|q>nA7CU|l!btXH)yuMX}W&zBo6l>ojPK!odqjAD!oobd+S5=De$!JxxD?x z2|UqEAEVR}&MFE3+EApub{9|@UsZQ|K$(M|=*dI%TWXv!;InYxzvvc~|F#I=Tvxa- z49Mpi8H(dpn{wjLKkAPCLLdeqzV&+>#u+lFv=|W1mw z0Qcaz{_X9{aR<*wows9PgU;0^!DNyB%LVj>=~>1TI4HGk%r2i@Oe!ah?L@t{e^={D zF|!3`EQppNtq@vKysRGuZ1LDuCraV_+zgDDi?Q^pMVo)c7MqxXkGctdII=E27MU7B zv-rMcBH2OLlm^l{)W4)ZcO^c=$)zzHqU~K9|5c#CDo+_9D_wmF2zE4jzj;EW+OhM2 zqf-*t)7b9Hk5_o|AbGcq(SQAcZM3aV!1p})+W%cIv%N}=#pC&mY}l>2bn=n-JV?5r z=&eoFq<4k>D6~{om6%mdJfYk8{6Y`f-^4YN=|D_!iAPCMUP&0RPIEN0*UW78vJ0Nhq3FTNBtIoNkD@W> z@t^i4Hhq=LvcILNvt=1{%eWEeq-NtOO;)jR=J%eZN#{cU;F6U1<^2m<^KxdT_-f+& zoA>EhY4kQ-88pQ{Rw}~`b2uW^nC@!g10`8qtxGCxPThrg6&5q6(S7Z1HNoL6fY;3! 
z2mWZWuhP|G7JRZAmFkqUJ^oa{lqkMR?MWps9Jzz?Ur@c|G0LDThioqfXY^u(Nfm$V z9R?5EN4OYI!;pQWNL;6z0whYOPDJmmeb!+lM!_Atom?fBCHeD$H$uV~V5E6WI?>>6 zk3aO6PhFJT%;2cbOIAAOEfRkSSt@NuwOCo22@r^(hu1e?A;`yXL0lBWX*-I^Eiw&` zrH@aPpLm!nqE8&l8xdL;0fuwR(bb-7n)ETK_Lq9=;=d6VJ+W84a z;kjw{C1suQO(a%LvH09j4q-M3z9%6lXE1nHkgA2$sA3QAq!7IY za28-S-bYYpv{=VA0`V|oqPR*|+iEcopUmdy&d2jC<^tXJ>?TO91$x#Xa~=D%{2!Y8 zPdD`sJzB@l9t(S;hEK!gfEJqr_?oJ!2jTgDKMY1ud;RAr&$Y|cG_Ma|$eXYMpT}4x zH!V6bPnc5rW#Dm9em*6=a(diax3>%?g=hgX_G5ksnV65;Yk@+ppvz$XBv4PF4ftta zEf|-FU$@lfek4+=1D5?qSg=wW%QWD{&JDS^I?zMGWkAan^-czm*1zWNE=0IW>)X@6 zg=;e%FQjo8KGm zt$|{Vxot0he|`8zCLHx;rvRwV&;$(5h|pIQsG7~TGtC4CF#=2s09PvkNSP+U z>2?8NnMx%EXSdYMJdq>Bs{vtD&5{LZr7HviAcYD5Jke+ky-@(H90<*TO$4hI=-9fC z1AzK$fVMYFWqNQK)rx^uV&C~cT*m{Sr*|9pQ79K1vu<8SBcDi70uUhV_tbIv3i7}l z{H@Oe;381!GmAeLp6v_JhQSCx{$dr1IdUQ3JfbfGMGFbFraejkMExe@`cESUkNL4> z-2{@oU@Ygf?w9-&`nGxA3j$=GRe&;*f`L$^S-@Q(7X#~kbDRr9xu*)4O%SmF977n` z6iz02b+oMKMiJpCdm0Rnq*bPt`<)3(zZ0fK!lsRJLrhXk0?a4aRI;B#b*lA~fT7rQDhyY^ zfYcdxNCU>2GQhAOwQB^ph?1@aLsvA_60&R5^Ti9*ej>h7AIL1M&B;5-*QUw-eeB>;ddiYW&KwBD3s00r}}5uPrJ)5C}RE#ZdGBTA78 zfm{Os;0I8^SQY;1R?PKBqdkjku&8>T=w>G>AxD%=1m0)BS*1qpnVMozEa_8>Ok?+z z$MU;sgr;;-KUG_PoVedD;Q7n6pt7UYlHp?McU+OpMEc0{h0~6TlGiG(eZVD!DI{B5 zyHur6909wSRmqi9JRcvd}FGC{C|9H^L1(Qv80YxA*HvpJujEadWouuh#aznhL#lL^LS(lTQ070Yl zHG_dt34p;en{>V;+zg?^)hB7uD<;cAuOMA%*|2@I0>HsFT_lei62uE}IMSk$UqZb~OHz+m(g0cc0i@VjJqka46aZf?9T5jzzU zAFxFTS-Bui2a9!_=l;kA!^IdIJ3qf_KdO$79#P+%onbo+KoB(31@R&9%6eq;@Z_yR z9ZPOgBv0?ZVC-dtRh}I+7YE2n_;tp ztTi+b=I6U)+L%7SyW~&bqo_Y{%}E|JnG8(DD^mSK3A)KJ3!pBB*QXy476%zv=29}h z`QZ7niU&%|ULQ&38cikyic&x)t@==-WYY)NarC_j;JlwMrL*QbT&;sc{l%iXk5xpk zLi~0l1Z?)(X$KcD#85SY3$;$NtjM5a6J+9kUfeou_KRvTp(T99?E3$JLpiO_mjYKx zg{h%ALa~w^dOaY84ozUlgLwc>PQdGR_0e748#(+O$WIgY0VQAQ=^Q3344w)-Ts}<7 zQP*Hvi68Mnpn7khFzS=T`a4f~w-MF&YxqyaX6<{_*H?l@(%-KDLjP#~R+AHQc1^t| z9d=1(oZ7gPejWOjd*i?TNEAJ2KeJK4ict7!b&25q3J=UJomXbB`-4I8z*{o$@z0%} zmN|s}hzNMdBZ2&`BYv$@;l6OFpKZPfB=P@rz5<@{`~LWR%0Oo4b{%snc-1pZ=3i~C 
z+TWqB7m`p|Q%eZCbK{(DJAB9a8=8$b&}38@4sWDA$2<}6&3_RCjv2gS*8k&W&o8}6 z$Ls#I3^|!zN*rh%Q1ap~?_LU`(c(-R-GEv1oa^VenpXIy^fe-9Tccq__2Wg%>XaKy<4+i|6|Gu(%jN@^fmY)EG1xduIk&|(5-y_vBqJ&_?*yPnB@G)3MjNR zp~Y|c&qm;%3P0U(HH+lfYq0X#N12}w_cSy;YuO=%1l@|}pZa)<)zn!v;qSVM%aE-R zrMfqk)rRSTc=+iUM*WG5t~St904JB+$~&LdG>I{_ds4N|R<8`2ULygQJRs&d{U;^y z`!DSsy<+`_5;O6ziY3Wo{vI3fH?KDM6R$7~z)#IyPIuug%;8-OB*iZQ|4$4KKiP!g zK0YfeT8FaOp|rW*OVkM!b(&pqnU89Tn5U_Gm8s~9B>RCu>%@2}JH-{Lsqs|{10htQ zRvzgiVf8!o#aTe{F!_Z|@%fuuo;f+Cr^r)Io%r8GF?<}1jxNgpYL)fm9oF4)pL3jb z%IjvC0Ao_FR9z+7beG*3q9NaM&6qiE+uQGEapfPHgqH+XB{5I>N?wgXv&W3af^GvL)xuQ+SfK_49w<2>p+(*3DX+3t?a=sbBvt06_xO3M&FAhfprpp#EMI zAbOsm3Ap8ySqTmD7JO{QF=mNDyc@#ZFBI{2p2LO4^UZS_GoDRpzg9nekCP1KsQw&L z)m)xJy*LQYT3b;9>uAn{! z5|#bNu$-1sF=;AlHNw904_ntJfC&Vyf?#no6va;;b|6b=VRC|?n;>p6)Ut6DJ|8&l z1l3cN10Tx^tG!6N zabP2P_pSJAsg0(9Ol>%!*nF@9YrQ{WYeRo_w3o~@TZh{-^VLDOiH=#Xp|IVQXtqJA zjV>opiNPo~f3b60)C`!pzWw;AJzpRGrgbLB>u;H3LT51b4W3#}?MOQ4w;_hn&#xVe z1&s!_x2NB1kN5Wt`G2VZzU!}RZFRmYQ%`QrCnjgsen>GVQq+*dhSGq8d*w6gW&$`9 z!bx#_O5KrYLdEjO=hZ~J3=^7X5$iRCIscE??3Fx5NHUHdn7KUiu;P@yYa6ieCgDS{n1&K zL`%Mex#Et0cGr^FR*j8P?!Qob7Efi1w|Z=1MHu6L;?oMj`!v07y$fxKgtdG0_10Bg z(|3C#gw`*@uV4)#RDins4*wPeQ!hrTC(gmNZ(sZ}zzjxLF?dO$m$my_Q69X~nt=9o zYbHpOUbJQ*4samyb~_sFln49n0br_ff(!mUDrqokTNZk3%7oVOCpQAmQnHOaED>)G znW1RA(kr03fHjGX)8?-nJ0~E*BNAK@-DL`9y3%FffAQ^tGlU&@U>#G| z&1)VOPe%GCnnQhqfL2u3vYIbfVN``AeGRs6dE?Zam@_xB7t<5!cyq{XK#5K6i=C3b z@_drg9>MIWS!z=!*4lJ1TK$2XxTvS}J+q^CwYECq*>YQ1y|{SohXL@i@8SFGcbD6H z+Ir*_zIQ44DN|#ZLULlmLnehdf7wZ2{#P?+8cx-}_VG<*EViMP$R@KUGfC!c9=3T5 zQ6kgMoKl8uW}9c3r$WhGD5W+^=5eF4l_6C4D;b`9pVPVi=f(59J?DA1uC>->Ez4Sd z-}}Bl-){i#m6`M+^I1O327ay5m&C=U7~Wc5lRv!J-Ow$lYfrf7U4X5o49slK@1QD; zxY#J4&=IZE%Uxb}|H@41{VSFUh%L3U*p9&4D2IH_3y9@VjWe1}q3ziS}*zyz#taZAKsqAOxpSDzaZ(id>Xz+RT z6<=I!VK@9K{`sK?XLJt9PWIs4w6;20pHX1BvQJlQ<7^^p900U~cq(YWLRLO4~un%I5Z?)v*n z!?l+$h4hmx9gbZwrCI!U6(EkHh}&8#^Bk%xn)S`~i>^HTJHI3%4sniF);y=DqG~~F zbJ@JbyR8`U(u>(zJ_|UH!rb2ZX*3+X(2s)d%~5jG2KWzk@V5WMF%?Qh_i9bHyBdNf 
zHh?pVnT0}Ahm6a>Nl1I2Q+E}B#*w`FSvRu7e|a=s`o_4@o5YjlWWEB*EvWl-hihyv z#31<&0=Yw+^=;NVYul>!E?cS?ItYW{t=A{Jug2cxU(6BE%~S%j7E;C)4qpck5Ck2_ zK1hJ7yq^qBL2rOf#>2*Sq>6o{!YB;74=c6#$0V*s$$uSUAD;w!hY`9+ExN*Uw!JeF z58PI)8?d^!x5bb$aesJ@&0%ZwS9B~*>B}OQKxQ!OF+}S&yt7) z#EB1K)q>^sU7MM9Cm0z3$5nA-qK+~F8Y{Gjt(7T34Fo#r#JP{3D@`g{;8tT*A`j~& z@u82%uxRj(`)#e-Qg{KT(-qG!QGY$9*#j<2*$%kMAmDJ7kR-rHZfxvqZ!V`Q-VX9E zR$T)etP;9CpJvr#*pPU0aBaDc)#z#d?r(%YYqYfQ%48a9?4D=gV49|f=S#pj)Y&vr zeVp*rXHkXb@CoL9Axy`e+ev%XRt?%pc8?v^ zgZ}z@tSj!iT%R|**q?A+o+r9}i-_z5Xt`-F5TFIv53W1)Aj%i7&Vo0L*Q!WDCw*rv zc$<=%K1%3>Me8RaL39waaPs^!Jz5P~H7GvcV4$y89l}J5eeQfo>s9!186FRo5_)E0 zNz^zVfVXHP+5w{7uu=jigWpRHbpeBX-Jf4NBN!;1;^?745jNnwy|L&+A|P=#g0F33 zNKq;XPSKN)Hc+@W+hH6|i&RU<2g0fe#Uv((erj|L+KcM1UiS2rXr<(;BoMa-iuV)r zUy)dicAIAJ6ob2np}EOpS_DHgfAt&Nk00~Bu_i+22rDa;Bw*@_ue?b(PZC#Jotxvs z+K?VFt7(YIiQuQoMafuE3g+5k48S2p_7<0$y-;A5!%2G(HTktI^sjPCy5KF!?V3FD zM$~(#-XFp0Y9yWgy}fOPrj29F%oM@Xq}PB&5M9rCliB+5yY2(p=J-Yjk57a%IGeL| z?i;PiHTFNK@JUY`n`^BaL?DNzm%c^(JH$ZLM7gNq7?F9gCdK}1Gl#8N`&oF+q`kuI zle47U%sRE11}PS(IN-}I6*4M{+Mr~$*S+(2GSud%%)Q&MsB)XNpy_p1AVl-AcE*|0 z1?D_&e1E!=2uYMKZ{^650cL3EHL7=~2$qvcnh~zyc++Fn1W#hKG7vqitd`E7qD^_G zq6N{|0|kw>o$LfibTQD&mPCrOLgVJB5gV%>e==V+qQ8fQSDwSro&%kF^AyEJMwJbk z8y01{NxTtvkNtpNu}c2^i?{rYm;`*uz#^3cJbd8bj1rUymk`V$pSbS4q0o*R|sCvHL$hIce3 z=_apAo$MB(Hs!)rQZIi!_*WMKI6{MuxjiQYBHNhR-OaUaq{TUGpW~PNC#+NxXwDHb zUWU{0aUb9P{^q1(=bNIw1B@;5ms(7;bqKaqnG6>;5UMo?IR0%VqHIk?{k;IG)`YG5 z7Bl)ve738k290ht|!`yUlk9PC4hJ zj*sRBJVzvuj&Bbwn~sRJjZflAbKGv-zNO%TC>I4?y`+OaJg)QR@bpn>aZ+xI&Y=fS zU!NzLy$7uQDDOlq+2%WOY;#-@e7{>9yc6s~cxXGtz=$06v?XW~L{R>Wl(CFzUlLJw zkFhcb%)f^y-o1rl82A5tM^gz*gZ|fB4KTMR-@Ifk*;u~u_4|d8p!fF!9e4NREI;iT zUdw>lFNX9edy2PEWky-2Q{Q`{#CiQaa?eOq%UG@4$XNTq>D85a4y@;z^7#J7_Y1Dm zUU^Gp^-H-KQ1jHZKFc`j1V+XK)t~*{8?roR14^sp08cn>gJJ&}hB8-=ILbEp$DePk zbMuRVCREyff4uVA*Q)d^LoKU1lN2ftHT9}`JuGiA>^OxA<#so4go%Ily8DhET=S~j z!#pA2lsj&I`I!BaKZbjH*LQxS+j%s}?n#8(Lxr1EE$iR7kmQs)nBey~lSw)V7oaOgE$&qFd; 
zW5l0qNs)57-_Tl@He^_wE*Dp2*BB&gJ5Yq~o8>BrIIic*=$R3tc$-$(r*XkQCzvH< z(4?chC;3&4o)K@!ca!*Rs34^5ux6jXjU0sGqz&`#6COcZ+2rt|*TDvQzxU@J(&6^= zg}d!>M6cRghJ}=Y3x-At{Y5LK=I>t>jto7)^f~%hsBMZ)ifk!Ojd^-@jlk!uFFA*y z{=t!iHEdOzes}JLLtcDUbpZvRj$vKu`I1MAAA93{mb*{jY+`1-252o&Z?ao-*Wx8e zXF~any)980-RMUVsElf~&7(+^`4=&qP2T+yu`c|B-(ngqc?M z)N8!kDrSyNJ}j8n?7ab*QvERZS%8InxXxLJLXk&8wLN5C|L3*mcPTC@)s#cIn?EyR z*(C9=+1V4Tf%@ylt=@A7SBNdfJ1a098wPMW`-1^qU%k-t-`fdHN)QeeN*sm~df?QO#eE3^ zC_^1i8IO|0lb6VWYF**RS>t&Sw@`QO{c69w8X6H7f_LOGHKOWog3&LxVz7`$I-JM4 zWaRXN)Eypjj>d41ou~;Z3pbDbfXS;dIN+7LTbq?g!@P1cV;i9JSP=M~ z_+>)w$ErMKkDL@f7D{@x0U*Bz_I7_)8WyT-?EJbJb4;R4b&5jHOj+7<=IdD|4K`{L z;xvdIeH1!7<&3R-2Fi2vVOFM?C?&x=rO2+tuG#y>`YrgMLIwOGJHg?QK&Gb5xq+8z z5+1~6$yZ5YHVWC<7r0EN~HoeEp3Hao2$&>QcQN zvl7vey%#YaeC9g*=Wl$qHz68(WQxF26`><_^3h@Eq*%%zSiM z_o+x_!qWp`#5bgIgNS^m48gv^wcxNKt+~q(w!ur$Ir8 zPfTQDKWk_6u<=y_m$g!Y5EK`Vj~S!1#5@=uItmqup?Lk#awV$GNFEAxtgf#2oIPc% zIK^qy!Gm%eXKk7diNWwfu&y5{k$7|kd_s&Zw=}!`(`FJkDda|L!Pz2qd5+7%q`}qH zyI5FMTpO7{vKzRa5YM=ZztYzJC${3TWDLKt*fqy$q8Z2q1tlf|clYDJ64LI25hF^D zN}yiT`fCITinj3sOS;zxBfqqIFhS*_80pDF#T;0fT!CI-C|dLF|9J#9JskeaoAjgs zzTKwjuvDD3{qO_-@^JR7WX3WfFGUcLH0Y(~Jd^BH6uSBjg3~^xlx}BYdujFuGA27%z-_lJfp(iK0U?dwPV&OqH z6l-Dh=t=Mv6#a5@({wFGUG;8tjLOlg2uFR<^q};6ZPpFB39nhMsklR2!vFIf)|hku z$E`klaGHGc+omlbih{O(=5^46_x0NDK1};|4O@yk7OxdnW(U%oHlD6EntxKBR{%U< z#b~-D_)VqeJM^|qFE8Xl9MalXnSHa_*1hD`hCQ^e@$p82{(M!5sqIWJgE+}<=APO5 zAC3z%lG}Y##4tA?$t3L%Wdz3$p8n_}I!-g&GP#g^pUGU9lVHa7BJ$E1RHy+}8qCTJ zX!L0AyMEK^A5LNc|G%S=S&?4dCGI~9f`-AGZMJ#Pq@2?0`XipAzBr9B}Jhji?5}eJ9Q;UVK-O&^Eale z?5DE;>FyKFY0(&eiRb+2%(qM{h}E1`3}@a>_Wh-qGtT*p4upqgj_1V@L zA{Ig{n}QIx2VCSTTEORv+*QQ{4BWeWJi6?CTYmHxY^m4hT<~K;x0Aw zWYhO93ah=~$dR@G<7s)FT+!@5J>m#R4X@A91&7|Rj_r0XHs^4 zKwBj>60vd6yEi7O$H{@PxrF#ln=o7wl6*D0{8Z4yTFs1bq8YQmXiS{u=;YU^KK#?E ztiGD%J8pGS>dHZbB6k{in3_MO#Ghq~qHAOsplFTOiHCp}i|Jpt`%(_of1g+OzU~$M z;jcS&@e71^6s=ld&0Yy0l^j^>V{zrfF&d;`{OF{S#F>oZ)w7Rk+D#HoLtS5;DQtK; zvT%hk640F{jDD@e{x4rAy 
z7H!P)Iiqz!ZJ)mF;X&u?>;{VGg=Gw7lilWTOl^AOcF?ui3ny_3vPyOjEzXNvZddSy zOdv@a$T1xCqF|5e?8vs{vWn8VH9SKNkVlW(3H3%@CWf_R&AX=0Yi!NI``ge1Pq|SWk z`wju9M=r2`qb}Xqk!2S#i2{eCASNB!a7N&NU4Wb@UV|?_5i%dg;;>J6e}Vbd1%Nxp zY3n3$=N?1wadBW{RRet0MSI?b27`JtDb;3|hm0*UPY6VbHuPi5;TqzQonsbYy^%>R zx3oIdbQT~vV_j*uBLg~?Qw2FPo^V27|2>#+ccg!F9PfAFTtRUIinK`6#zt}v@eH{`a ztD_{86Hk+N;=d$vJ)aH8MwE`4PI`!KAm$H3Fz%p;{!+jk|1$!IwFovDNet9VVHS?TIO-v#CL>CNg%rF94Y8| zTbvNz$iLV`XEW22K^h~qOWizwJW!tdWde}D>uL0_8OL(crI7d$OP^H7$*CC z#l}v1gthX7&e_{hd(rv*bS7+u_xTYtE^69(H98wYI;4aBc&rUFML+$~8U|-$Kq%zY zaR~_2(Xu_I7&hZ>UJx%k$XuMk%cADh#fBrUFs{Y@G6Nn=EHbL(E!$Pm9Y5i z$2fZl-6Mj6v$>i*5~m!{n;gR#i7!rZsp@ zF?@SeZbYoBo3206ZMs>7i$J`o`*LB2>s@tWkg>_fxDpNT-tM4WK2;BTb~VS%j|wv6PJVjU@iuU`Uov`eYF}D`_TUZ*`TyGc+%3!TM*PL_B`OfEg-sd$$Raq7n>me2dLAdgA(r+LL2@`^lsUDz% z-&|%|nuEX44HRUhAvof{?AF3K2%?4Lr6u0FrvF)T`~3E-RqTE?#dF9gtJKICQ;%#M zi}igroo@`kdb?cML+d&(%O|4a+7&0`H?1o_KXRhkU7dLy8|8F|UTM1)V>0?))1^XyaVE*9} zOwLID1m=0swEn0sVG<88UvW?4&ZU=`Rb`56z%~B~aTttSe;JyjiTw3|FkZ4C;MHTl z_fGsob!c+DI(V%-@BUOv_bb(JR4;j9B`kT%6y80B_HGy^34<7xJ`XFg_{nRqICJNw z*vieN|Gg~OAgz5Lg<%sM*PX5Cr=0R2K#^*O4wKM#XcE7<23y?}o5BHOG0EhTj6yL$33Z{++LM=E~Kh&qz4 zg68Ox*GAv5=c`J3*j4plfvDY0m+_=w%<@6OBM(jw!*=wYdzdZ zi%eRje2z01tQpGovwk>utfLWjtkf^6v*U&hWykN^8UIpe*eAn!Lk)ZLbJ`m(OF#@W zXaNUGexw^Z6e|#-^KKLE5%cSn3m6RnYZKD!=(P*h10F+03+zg(nGMxJT*wFe#iyLM zL#ddO&=zi~t}QFsmp9nU7aeJz!{21J`f{`g_ImzoJLgakOvFw2PA8)gouDlXUjm9$ zR>Y5k_Fyd)hKL84`+988Z^n?JH~)I&rpO+AB!NxPO7LPlxu%1}3-55OMtA20xak$%L?pXH%i8fXXcppHj15>`_QgVDJb&Ve~kz&|F!e1+Yx#swtd}}^7Gox-x9W**RD*wr_I+5c; z=ysO3sKEmHX`Rx${FStnp$pBF%s9>o>LmZ@Y!-$GFp1Z41d_g33Ualzwb`|`Brf-FCi6Gn8#r&`Qs?&RiP-+YZk;^c@x3CVdNVFG4zHrrgd6X#zSOArl#~9 z9C%;9el4n~z%Rbn>+={>c=f8%dV#<6eHAh{H@EBGx>;YFF)Mz+-QHM%rKmL zW@%}}h=;PIqQW$syWVm44ZrCq_4%U17yDw_G|xnuCz*;y5HrmHVygv`wHvgn?Nn!0-5O!XKdM=vXoVAWJXMK)^UybKTg>ML&)M;HUcwKhWRD0jedgHo|(npC9RW4nf zZl8YHhcEN-@o_<=#l<1sr3-&eWb$G_Gm4E~S*pvY;I-C!-ke9*Yv}5ZL6zUWeUm?A 
zoAa=T-(A!wrSXF$Kt(~J7ZMWkCqiKP!POZL|DBYihcn8q?DYqz>>hgYZQ!p&-0RUGFqL1UT)_taamy`A=K)nr3+h?w2=n z=w;pC{IqD@F1opq<#<5yqYeMD2x9wm>v8#P2wKfoqk-qZTI==f9UP8>#BPz`C!BC! zw(>?~mlaQew)<<#`+L2cGkN7UzHFtu3Gu>7iHcc_MgCi-?ccv6F5NDjk+SRS$)M<% z7^KLJ4m;{id!bGRaZiz!bu4;8QL$*lD# z?_2us-@n@)txB+vU@baMD#51t)f9yoz}DBVnIB(X9$nemSAl#qYfL(S+pn$nDoCkw z>NIw&V7N-HgOT>!9JWVImuS#Gdq%puyX(4FTo|3491Pxy%D4LC^QD=Yzz8aV_2YgT zSp@}bk;nMXhsz@u-r*O&`Wz;e1lBTKmODZ4jC@z3zV^$qB4Eje#-&|t?dIraHZ4WS zUZ(2LA1e^W9G#tg-L&IxlMbbL)bBd|Pj1kmNNj6rWo2c&MH_){B_+(Q6?%t*zG&E$ zMuS8xm+N73HKzDKhK6$9vPylkE=abQ+uYvXNVP2Q9P>Kob#!#hE+{~c5;=L`?fvoN z@RW@c!P*RpfC`U!UaHH)}f+d2_8EE&~CA?q0xw1P+b4dTXkmlap26aIcoT^97_J_It#$NJvQZz}#}B zD9*LG@giQ4dWqkRSmv?U3US@16;Tp0GJ}!SX9Gh+-66!xmG-}XL|;5x_IRVNj-H|z zn82oS62!I{OQ^O7|9Zj?iOUSXyHx*0_Bxa!x91J*0qP~a7wvG*s-YML ztPpvys5Zv1aNw+*oaD_d!I&5OWH+8uyN;J5pS=89Jucn^{iYx>ASWoZx3}+Ls;Bkn z@jRQ>AY74c)OE>U=|5hMn^4Y(6Q|cAYAbFGSn~k`39?rX zXu_oD%uVBv!Nn}2Lvq+?$L_c>`8<6uQn2s1=C4TIo%9zz0zXYpJ9@DbFHbj1B4mg+ zl$oK5)yhK(javx?@@8P5vV{^n5uS8|MiLDn2wbQke)Y1PrMeFXQ>d`(XL!)%Mk&CM z+21c7bz;#=hko-|kubi{gu?L%YZ{_XDN=jr;S zK&v>)piaNZjEeQQH(?Nj$D@;Z&f8NYGkX1ODLh%lfDd;|<;Zx!;fRYFqLptT=q;du z$s{~JM#dyeGp?l6ZGhutoKHd!WnoWle3o9KUCKZLCVhT1Jo(nVZ^UP;?P{$NCf; zx0UB*4{LxxPV|v^U4}RS{FTCapNy>~nO@}u_jlz-Bpd5QDzF8QQO99nLbe{%jb`^0 zXPz{ z|GW{a3Y(TbWyIv)*%{iRNzMG_Ve|LzTwGiPw%%4&Ps3&MXwPC&0wg7U1tX=Dltz*- z8tUpI8m*7ldt;u44C^^I!XyCF9k-~1iG3`Yrek7a@+FN)373@2Rb*0=eylJc{aQ{A z3^fJ|3p3>Bt|J!(&W7ly9v3*&F;xjuidh&Kp2m|Q*U`!4gB#z!diA3*DkqvYMg|7c zXHCGWs;V+vrXeTCsMBKCduha!8y(%K$Bt(4k^rVECzn2GgxqgOZ?i3kCoeBg_g32E zA$W=!3yGYZ9EF{Sq$G0utFI40e*z`xu)>|+O9VZXW+5SZP0P(qXy23$*JO%=adW@! 
z2^~=E;_geHd#RYPWY|AjUvX6T*HXuEDzAGQsd)h}u*loIW#3(czTKN2%GUBgqUmYn zw12tQ{l!nYT&kLma(<7RWH(H;l=F+Sj9*Va?Ki31q3HlIr`9hjBeJV{cvV9oMbxr* zGmaH#P)pj8SdrRnh^BH+vCy+}{*QiX#O+biXmzPQBndF?0rX?%3++qo3td@%sTR7& z=HK{w1`MHa5jKJ{!M?TXR7oGCLQ`)Pd=Y~r5*Zw_1~hg*_8q%3)Wy85srOk&9~P5U zu}BMvirZ4y_at-8D+z_uoO4I!f2<^ad)4b=P|@9hqtTV04n%{5gapW^$zeNGM}D(b zc)@P*TIhb_0!a&l^Da^?zSxyz17=jj^Rm>&B=lRfps`p$VOo~&(niKtdCu958_b5) z>&s`+BR-;Y%gZ03<(S0DM=0IQTiyoV{i4ZBCXkOzLGM=W$ou?4OlN@s(*f(pbj=n zEM)w3t#%`)A#h3lVHZ@Kj%!a6%D3(w%4 z>-Er4J*`b}v&81N;P>EfB+EcAIGASm^fg?mq>Aee<8Jij@X3?Wvn({|={S22)8r#M z4e1`|=3TsduDt4YqAX2O(x%*9m1@}9k>HUES1odOU{;Oz_wvmHrioX zf3L#%Pf#~+E5^?drRuEjQ6n@t3)8qzBg}^V=eu3dWX(CNn6Y?eQQ`~(}w@D+{8W3NY1h8yXtlk3iOo!Y2a(+ zq1hO5nI^(8++~S(PLa>SMd@<0>sTYX^nJ1Z3$4kW=R9O9E9fZQP4^CT8oZwJJPZ%1 z$*&Gi`L5{HCG($ZWb}Kl?3fW$#qKJ_!X~!A*XBCM`fk}VWCwIVB!v1= zW!PV7Im=;WVnWByPX_wTDUzvhsuy{o1(buMoxCW6tCQbc4j>4 zdE6-}DZK|QKU{{M?CE|_0=rY1yV6leF~C1R``{)b_6{BohTMQG~c4`Q5uVnqvP<4_O6&P%bwiLp4~^WIbG3Usd&c8`Kf2@K8o`2eyz_Pv1Gtmi-i3ce}z9L z(wvk?MdTP23mY4;RR8_^7v8wy?Q~FI&u_KZNEjt{FQ}!ZwR-GY>2YDxs8ep&kdl#s z3%JMVs4~|l%W(Mj-GVg9@(Z#14Y8wrUHC%;`v+bt zwW~ZX4x;N*5)uN=_Gj~2TByMRRk`q2(Cb$4$B!SYw*LG`KkBU)Dow{8f4)8Lf8WSe zP5eFZTI=G^ydNN*56HsMgk27#4GrnR?UJA)HKt>I2#I1OO&~8X4+V6`bu&h`;NL^} zO~sOEs3mOoV5ynBM7^x5ZuX zG=jNyZ?R>so4-EYsmBLd!6+Db*$oZkS65e!$9+_G`(+Kt&|6lmuL>3nLSOHB*;O`* ze)B>dB@Vj8O{pn4AnpKNjey_!7hmyB;T{M$!r*8BgM$Oi7l#-L4M>F~ zId7)qALM~y$JGubpl;}R-<=f|7k2{HB7DK7S?N#p087ErnoMm8AbR*>f!+y8Nnsy8 z&gr-;ISINR(N@>gyat3DC=>d#=^n%7j@>zKnj~P?F)oF5n!@#KIK=C9NCxqIG1AcdPe~LJ9N`D@HqSwx#{|K8*XOPMj z*Y4nKv0rr8*|Z3L>uGaG?c;(}h;033U|LZ(76-FVF~YzT6Uug9MYg~&(-FgbE3E35 z`o#hg5Ah;YN~V?o_dxF-xx9ei-7BYtC!(If(;7K!>Vgv#y<>4hBtO7H|MfyXyyv`v zCPJ+CFQr){wM4rqZz!P-$t6jWJ*=1IX6|uTa!^OX=E4=@mhPTwjIMF|u+CDsP0Qjj z5hOl3Xjiz)5yO^x&+v#S<(pz@h&Bqe<8-zzpd`#C9#%R09wig$w!j?2{X;9{^UTgN zx;XXnGntVAxE-?9{)t~H-T*8KJ6;WY2=tlvCOlrGyf7xd$)UY1*# z1cw5e#&1e>U$ZMV{(KG#3Rde`8{M=A_>0xvm?0J*uJx~x~bIpH7I|FE=^0uidl);|X70ccI{y+OTP%r`ff^MMN9C 
zM)Bj3TTHoUddfPj<;8q_^X75=$(&$olRR_b8qYuWY1AK$I*8{Rf3blbD7F#TOlfY2 zB}+?B1^SOYxq0$={N{06wI>?0L+FRm_}_S&205(+W#7osr!&7O*-*jdy9ZLJ9niNV z|GwU^^!yi46swji-Tq*Gcolu$s8Qye{%Ef4_ApUoxMJc zllfuXAJPTwkkv~_UwtLg(DgjauLszqgVQ-KDJcl(i2*gzJl(2enf{GSHeRHTd3Gm; z5Ri*tNghEp0?(d>@R%~Chyq5`W5s_;vs3Kvufrh=&Q7Rf%@*}(q3nOSRPemjR(n)L zM8?m05O1&QRHz<4EU0DYsR8s!4oGCsVOJkuWNN<>W%48pa$U+*!+MDb#Qhaf1N{Bp zKP7|26%}!es*B+kw;H#PEtTF%_DR8Dg#2l^97)(6Q`hvjy?uQr?6Dy7*BGsX z4t2Q{A^CGMK(zGqC{PE5lo~O+ia1lGq$FL}y!q~!5syUKIi8}0x%rwlo4m6eDG3lsCGj=}NyBa+a8krwGPOdd)SB07iX8Mm~Ba&>f{fp&N434~BEpVE%r zM()|s1Ste}g!iAzFdyo?lza(;@#IK#8Sxx%aHXcDtvM@r%&^jlVqwN$PMGgTfxd*~ zf^SqX=FIM2A*sY9#>8~$8vmM`lk+P)YO~8_yK}bN0g$CKulAp^o-j9vrCtQ z|Lk@`hnV3lxama69orROz(0R-HyVt&L%8urJ4yn9H@HN^%K#xdg7`}F5+0;l{8p7U z!S_%MBuVfK&(EE~MH$dbI$gQoiO{2C8lCCI#iv@2q`$K;NdH_}8N)XMO_;Y9mNb9$tftHtY+L6l$9yfY6bJXmJ99?19OxSoDfc}^D&bk?91wF)4`+6^TswZ+Fk! zff|}uScuWu+Inj?zO`jiTUXb)aV%ak9je1f2wwLCpznIWPIxPcd{ocm?(Pmjg~}OS ztE+k)cglro;nl$dJGN4&Z}y^&?3$M-C@5An=x81j7%s^ah*`)7cQo{&)83~9acbKC zpI!jmpzbdP1qIRS7rBW&!_qJqGCZ2#RIt6Z^$`%6vE4*5u|iTp@C2k+u`v@~G47>A zY;GEPrQWHk$_*=mu^k2Y&zWKiMuKeHDIpCF&CiV|3?@uim|vQ7IPSX(Cd{PE`-Gp~ z0K9^2&gg(>OHxwr&2H4uHg@Q-Nxyyb5`Zqq?_TEng5S3VxI(-BsH+(l)kQ^Si#6<1 zsmX3E45(W)L>6^gJ+EZmuqLpuvSPVYNW-{KRplNGd!I8u`6rK4s9y5=?OVPrMNJYS zUhv8~KEpcgsKw^Prp)TJNXW^fzPoY<^bCXLz_qd#`0DF=khnt3#|l|}{k32_YQ%RU z8cC8X7zvA|aQl2}GMqop8%Q}dphTEpTxUEMd=aPp)iRNl%yPq>Ht6CCgZ0Uld05(aZSx9>>B=xfYmndP9*Jro;o;-DMV?%(>Z7m}RIP2*vB+L!kW z0yy7}C3LCFqyC~{pyM*P3US?hc>J$f#H>Ncu4ORPQ`be$s~XSvlvp*H?x2o)5-Gsn z=e8mj$5amVsarwUDSP^%YeIoKOe*ko!21Z~SMzW2tPN}@@?;WANK@Q}68o?k81z}L zKRkAv&oHYJTr4_m9wWs0tNF zS=-llKPp8Qf{it~lQXh=%z{X|L)*2@716j2@&!Men@dWgFA-KoLfF6`S;dGIn+MJ^Dp#A1f%cKZ;!d z)+0z=0HFXdLm*Mb{ZcMm3h@@6^5JN9xyOk67=ls$O6p_FzrGK_A>6BXd*DYWj76zO z4GO%5mB!`|(RxwGpQs{TUC1k*+vctO?u=APz4A$pE9&IbFn%o_?z`c$Z`!|B^~toW z=)Ar>l3%M5{x)bJ;Uzg1CX!N}N>{g!X=MGXLDmJIg?=brl(&a^4<2Vhm@fnSV_j}+ z^n?pf_x;cKqNRU{o4)Nxz+lg)d`GLyl9=o!navvL>vf~81xwhy&rIN+|&`&x>%V(UgVE`u-X(z~ZM 
z`zmLj5tj`dw?*M}gydW#g@cog*5=|Xb{-Q?#C>+!aULsqvZLzce0U{`{IrxPU+$ic zDO@lra~|v%p804q9{ets1Q$`*RK(Fw))w!36b>yWS_%+`6!{r$E82+na`V3` zB&}a zjbg0mf1`YSvasm-?zFE5xRPEcvdGP!npHu(jEoG}w1}1xg9zvzouk|$;5~wFCk%l9 zSWnH&tb-I>e~wfT1i38QVSvQ6?d6faPKzrc8X6i48(Z4AK}S3M?(i5vGQn|zNMRk# zXn#7c`2ycn@$ORKZ1qQwailS~tKUh-J;W;6b#zHA(OV6Tjk1R2 zt~SpbnpBi)vC`dz`az)LW<)*UntJ}FB!_*ZUO&ywc@XUA%AE3!<%&ro>+bC>mYSNX$3=m>bvryXlxB+<{2JKV1Y0hjAMgIY1G&3Z z=jPWRKD=0O`(LSMZn7gMkev=_!H3U||JM|@?l^yDcsS1cGW9k%5(K7_Nv9@0Nh#a} zEM!6+hW~A{7F$zO^F1(|fVAgpO-;<^rO1u`sJznBr+U4oAv=eMLH!#NAl2}el{E74 zi_oQJm!R8Q;5FFzR#M)zHIgsb#6yUH59ppbiv`ulnW%wTW|K@rWfmS8GoEy_sIX zVQNxV`)GcA&$MFcxq!h!E{)5M6enza9&NuY#gt{_9wxm+Y8dE+q|ekzf}d zEkW*4T|WexD$^x}K_&?iHEMq-w9_rnj%1T4w>c@^t{CzVwF)Bw4Z|*nlIE;cPVq{2 z@!+2iPoHS2$S!q)XbfUWOs>WWe0OxRbX0%R#v#|GLzs?FII&L)6(A2gY7CN8N66<_ zCX(Y^(KABe(fg=n+1;Ru|DW(yU@v1hI1aI*uimA$alPYnaHBZt9KR`2A5fKq5t0B* z!^GQh_CpcH;Mp=^>Y@5s_M~!INbzNM0ePAFC-0u>Qi`3T4IUPE7u0o2c||6Mcpkil zr*f!};3CdV+X3}9rP0%R2?4y!17xUZllsRNbFfa#KST&*g&{#JgJDw2=?#8=&GOJ> zIkdXe7Zf|8+lvtA_teX3gDY%u4+K4=Wn^Kn=;A>CaFie9DXZG5pU23{iuN8p&~W4J z`ts?yUFif>c`-WUtL@ipbR=LgA0IZRTd4wMR8=epvgQD0+}2{Awen?2vMd+x_3Dng zSFlV!F?QWa#VIm-SsM-JkbxR>W`C>4(8-G$%Kp?5fjHthQ_D6)EVoX$R;Sf6T#rkf zp=Gl9vBZ4XWD!wQHR7<1whcgq;*%H*H4_PaTpkTEi&^)T6h1ds8q!Z?_q&|#kGNbA z`h+d{WaDU(O|42WLfK6jHA0qDPEbmcW5X66`!>Iy#qP+FTmd9W;r|}K)%sL z9cQyTWg(tlc=2+-$p#$fN$tUGGVJ|+8ijeZ@@)1_6G0{gL~`%mkph~71OcY{EdIyQ zzi%{_wd(<#3s+13CnT|_GkYPdC zENf*I7v@k+L~J33tC_v(qnt!E zjW-fs|2597a!OC;w+~r_UtIeRE9XbQgeDoCG;4n?i6t8~z>EmsXl_i^>C4i||6W8+6zUHM-?}}NK zyIKP;FVU5p1z@o%N$B&I1NcS!R*LQ?J7&&hkiT+a+R1tRE5>$o7DJqO@7}2`Go?HS zTl~QUGpyD&2o=gNC_o(WkRE^9#Lu5ZiGO1U^P&j`9Bt%nsrxI49^*M|4X;Vn0r7n1 zhJ+HdU0|rtWm1C#@jX=P*iU0EQx5|%7@}}+a-^e1#EynU|Nf264xfuRXSR-7cuBw( zNYjOI4V9Fz{kfsd$w9PkDGyltsb5#uG|gJnhTw39hdJR2IONO;%O`>XXqqfzBn(j= zS_6j7n8d9|9&5IOsz8*8xr<&!K&2(%47qvZXSCY5iC9yjAdLZZL~R5c`IdA^IC4BLC~ z1tB^ZpkU+%zEJK91`5NS!wU`}ia~Y%c7$n7|E;#Rf*~_1V2QN4mUO?Q41ln3s6oie 
z!1oXr03sw;{8^BtK%3_TLySLM|7wPhne&`IXlQuYzf%u_99>-UYU0SA5FrEXSfHRl zx2WMA_(QaQ8asWb;EIU{JG&?ngS~LID6_*zi(f*LRs;=CFrO>08q2I|bJow?f z5gEZx5okUr8o`V){6c`*79Ah&pP$bFEk{lPYvF`N$^n>OzDJUhrOHDSzUHq1Hb8{0qIzJtEbcq3Bh!!3lJhLy5PXgKnmb$!& zW)2G%7lB_33qbq429pvE|LHZc9IMJgqW>YliuJZ%82~KVzd7k||3~BK7#J^0jZS(* zYq%e_evASr0b0JOD=3iJ3@tzqkt_IiL|my<1ZbIY`{ung6-4us#?P0ATg? z^*%Qk7#Q4KT(tA|q{RRQbmTgTC%iw^QVswk;bf*!(ACWnpO)rCjL#n z4LR`DF?eVk%ZIJO@mG_&;ik@IKH7 zKsjh?vV8G``hcs%*9;3_&{2WjJRt_3)jN_YHzEFqij2Ny*qAgDV^&<~H}f@>o>z?2 z)6=$m{9ThP_+$k+f2z>XxO6?(lXdW*op0qbzuVy+@9Wu_l#Bfp3;mwcqwL*&-XUlA zwYmSShlPR|CxMbDk!04q9tXMmiYIyNVGD|>tZdJDp0Eqvj{}>3rJ5)Wosf$nj)?mU zqu0vb_;nBoTqgPYgTd&)FXY>2Q$)Z~K2nc^gh-*)XfzVb4E z4wLFS|DCksy32@zm&PDdFfEx7gY?qssbIUMZWj!s71^7bDyeXy*>xk6S+b1NgfeVX zMkuH)+>m@~@=zXCwD`7NtjL~ZxJb5}P59T#!(c=aGAMW{}E6P{_Yv0diSl{JNBML!2(f5xE0S{&E>6V1>r^iI_S-9jZ4Lo@H zVi`r`@q5ub62Fm=fy-XbY~cLyRzr?gkXaL(2zjebjDmu48}KGZaLU%%d40X!`m;ET z5|h#0o`zg2+^ZA0E2~qF;B%H_G!+h4LK3~@jWf6m`Q(-3k<)nKfO-eJEEUjacyzQy zyEMZMy~y32*>V+1c{vBFza|{DRVbx9vxBZMQ^F_A3?HUnuzgcco~QRE2NpJonX$g3uG@!L=8MZNghX z!jvXb3Zi=Tl}GPaYq4$G$d3C4sE|?2lj!TlmwJqrl%*m7-r_F)3)K1X6@4&&4=yOA zCc)R}l7_DEGpD5J(oOlF<5T;X76rtrs-qjfdNo(C#3Yjv>+4_51NrouNNs;X$rAqD}UWfT-gMNw9AFW2j@%CRyy>wZ{e01(V4Ve+)lIRR+IUGiI(qiIv1K{e6meL5e7xRDo2Nk% z;Q-u)=Z_*)fd9a)lq74RM4nv+E|CxjaicV{xrg+Y_OM|IHDTHht zXR!U1lfj5bv%_4AhZh7)#UC(OfD|qe+JO*2D=bW9XJ-f4<4Yi8K-o=Aluw>KLF&H{ zyPk*wuHsi9?X>8;U@d63j*75p0Qmr@OUOs(nEx*t5K@r0Ne!iTy_JNG4afS%#&KNM zEh>MS^{{97HUJy%+dc~+tSOuy!|7!sk1irK6y)VG>ur`oe-9-A)4yJ)FB+nX2h>9Z z3=b;;>-hSz$04qp@tGYr9V-aNcr>EJh!3!i(>e8Y_sv-aA3e!vL?d^T~>{-GiA$$M-m|}D|$c2GCl?I^J0!Y()-~4aNNQAaT zeV>5BmgD2&KcGD0KMx1`)36y|_4)aEv<2eY|F@!-ARV>J75q8|X^>`MaC-Rm6_q`jygP4Yl;v&LdxUW}KYrUk)vA4vCh(~bxW5IU4p4G&+|Yi9%h_^-jPMOLHVQ5RlR8)* z8UHMStNy0apcf0+mPBrkx(5aZI@nb7 zW2{__y4c#AQQQLT7YvG2?i*J`NBImSqkpDE{Q9K@-03ptf{`fwcF{w>xza`e1m2(l zWgQneZEcUk$k@Apt|9r$SzTSdV+;fTaSt#~0RKFw;i$Ko_XSWE694ew5vb=l=@xv$ z_i2vL6TuVieW%ys{+H(kS@J{Pg#X_xJdj7c6b-*89t(Z>3KsLf94mduZ8qO`K^O;w 
z29#Tb0J-z+yLW2{r+Wu~Uur=5DHZvD6}fPehR~pa!ND&Je`TDU&WaXNhZVQmpnqVl zB}`1V8ta0;08{B_8kxT2omtyU$u95gr1wDis`R>Zk55X%17_shQt z*~<4%=b9XtA5u}hR#)e@VK+eJPvP)60~-6wYld1a>ke=2s+QZ6Ar=;v4&dw3=Sit_ zJ2e5Flqyr-01FuLZM?@K4**^#Wm8`zZ@)ydX+2ZFce-6CZ|{KLj=@3tF4Aoa_Fqjg zNKx|*pJ)(vB9oDao4BXtp@wraFf$u=j{ojn(-U$hVOM#92FZD>BK7!R z>eF>SIKRnB^hX7mN`Q~APLB;PWC?Z=V-Aq)9Gjm25sqmfUxfdq|ukH-!ax(%}2dND6n zA^}B{2o~UlxdKW=tmia@lCCn8oo)gMW6&E`Lieim3)`N6Cd3$eC^APVHZG__q;TO+ z_R~j3^jAbvQYG3~LVkYYV+IF5<-L@JHQjCDS=K)$I#^6>7d&VWn8*GsFst#ZYGQUQ zgsP@0B3*#Yy+1%)l52lu$>wd34RY5+PE-L}RJxF&0%W-ga&fQ*eL4r=^-cyvf*dvZz2I*lhOkKH4nsa{y z&NxoghM?g!@xm#S@XRmIb=}accH%vLmNi6v7i9ycf=f6CIU57ri(u9R;gv7XnK67W z+>ydoYOST;C(_I62|43H*`tFJe<&OPfshM+&yERqdrBwz^9_?@4CZ1xgUK<(O-}tt=SG z)W8_CT~r}EIht|X)iIdHFQKiDW9=IyUabZkgmtlXoYiorwjBd7?0+mW7wpN_bD(dK<5 zwZt~fJ*1%$bs-ZewO-ahS5^8n-$@fOBt5R9740&0hN};!1W$4~`nv7pTCPlrczMR} zyRdD~&by{3{bu0ToQ`eRtT+lulDcnE->H+4R8{3==F%r|6%vCP5ajsiUzUqJlP3!x z_wPfEp59K!*$-UdJZ2VvVG-rc?gJJ{VOV+nA%*-2UXWi_$Gc%61{{S%ag8^=nF1@) z7rJn)Eyu3ug|KCpWr;)KLq)G{d!gih6r0qMW3JcfI+1hJB{8-7-N7Xy{QxR)^Q7Q_ zO~{|71ndq!*E;|v3u#zH-l&_+y=cAAyV{KNf%_8o3gWd&ZQpOekE1Asq5|kR@VJK( zVF%ayvO0~VC;VMH#F!|4olx2cIv@^(leTVBo)AQRi1>E_C?~v-pNz_U1fXC*1Y+c3o4tRr;w9_pt+6p=%3>Ro!`EGQDcQo?&tg%@L>|8 zNnKowzJBBjtR}ReAQh7YUxiRu=Z2u!$w@Y4{jYZ{Kq%5TH%D6i8=cYY zT=wsyZhW>{HW6RTDNpQY7%T}<*+u`IGY9A+dB!ZXBv@!5Mn0h}00NP(KMv-K+0gC@ z>|0sc@7a;~q2G;vs)q=OiRta%O+ne_X>`MiP`4kKQPiYOAXH zZ0gs%h${w@!2*-)nvIPfH1gG%KYWC805NGtCG??l&UQ)(-I?YY^cf4a(Czwi)J$nl84%2ACu z5QFG=dDS+*6#yXv^YoFut4L<0iX?GaMj{~ov-z^?Ibg;Z=u3X} z0gD9K7rl0{rQ>|D4wLlA=#`RT>r|LPN3I@@Dk7 zvMVd24?7sU@}h&0+obSHrhnkmrDDOO$CYijHswxSMJ~6}K3#H6?VkZRO6VNxHUl3x z7dyy_)>2JY$cY|C$nd~(C=CTYuld}AO!{Xm>cD+Oj#acg&#^*S2u8gy`zX8 zw)p@}41Kk=v8<>lxVLy1EiWIk ziQziJ0G7{0e>0VN^(7PKXag1K0?jBVB_msc`nHu07Q8wV)h zkSj`Vb~XfRs(oj~nSvzgLU9h0^uj^eE6C3h7OG@zMV%rYTKzw>Twx%wNyv3} z2Z)Z@-VKYS-+ErKx0ocTHV3EKu9KZXh6|ViM{c~^-IJqkg z>ZF2iKu#FBpe>8n6i>;M#u%L2R<^bTzy%Gg^#FMznLd-DVgvE$>hKPwch@nuqJnd^ 
zidi|`M~5RwxzO)p1^81+My4;Hqu<8N43Po_T!@E<=W@Zr-Q9L;m{OGXE<+q$;3)RLe%yx@QUz8$uoHs+5o>p*RFk-T(xB>y-1tMIEmAl)5IwuB_&5IT zPTmq6rKhLMTW1AFfq(%lSBp&!$I}|R0k_m}>|7@IP+nkMmyEhFx3a>yZ2JEFA=r>+ zpFVBl2LOK~0QE15{}La+?*Hl!e2L0MZU-x|fLXxCNMmASd5R|@fxfe|vjeKV92|JX z&O8D80Et}Y7Hh;}AWBL~B0pNA348VRA-9ATOBrV522*Q?lA)pD_-_oS=a0%FxN#AC zU5BF|yoWwegD*k!N)A+=DDHhzNE!+N3UD?;0Tu*7Z*G>Gu`3sBE%1uv0I|GI9`yH@ z%t`+DJQW~Wq(a^PpEENu5`@257;<|+Yrc`z6xL*g$JdmAATT|9kq%_lem(zOge7P+ z9dqb5UD_3_Mo>i!6A*A2zLB+*zvLtq{Izk{&!;Hu;c?@?RSP1oYRNRXD^&T5Ow7N3 zjTKGAg7~ix>%wLH=A| zFUX@5f6eTU$i3BT3O1Fqv$u!!KIStYbwA?Y-q85y#<(?_p@2mA8}Wr&Ep|jAA1plE zoA)$56W5HOMxVJg)1*?}pOYGdV)M)&=Kf%_gSwBLygd3??~Z8luP4c@AoE>$UY=zwD9kGECRFbp03fzx0!4zH68I)y_kj!M%bah(}E& zZE@!T9JW zPO4PA*^y&GqQ-=d@~22zFrAYkMsA7obG02&xapkmC^EtSQOuRNL)HH8LyRTcASQ&- zB*`8cyKF<22$Af{5@KxGvW|U?>|~7)g=F8iUS(ghWJ#2L$eQ){%=>-+ho9?mU1!X> z<~im(=XvhW{kiX3&@8F{!7^Ks#W!PEoz1!|@nzP4uU280!eHwU9>YXGlDo|1HyWCQ z@7yxH7<}r=Gb4kP$@a>0mI6HNa&Xg6DsSF7-2Ig3+7ni~lonWK^cmxuNF#cr%hc2m zcJ-2BMjYAqeyj_R)o{)zw?ZM^r{ax3Huu=S+Sm)9^-K^uSk3;)4-M3xb+SVeTs5AW zZS-sG)+gpBK8%0o5SpGb@zwC_Puv%a?wmTTQk41=Rb`igU`8ZTH}W~#?ATaHY%C4%l}Gbspy6bMiB61y+hLYtYHe2_>_8Igmi?{5w)ukv4X(&<@h z*E6PZT#$yZQ9c&WxKQ|swY;i|K_y?u#ooT2tZZ5G7_&RxNG61rzv=EkBT{}w%4n3% zX~4^a>TX9E$@;S-7v@8d7B>d`3*lc(G_xEn#0PbQlLxYcMt*Gw>ew;}uT7uRyb(Cu z<5fa1(pxQ~t7&EQoq*l%uFP=qvk(EV+|T-&W}9~|rpL)fob^{hP?9ji!@0AT_#1YU zauq}2UvzfIUU=T%)PsFJ+HcX7S4uU$6#UhmJ)~cc972fR&RaUHI6@s3QE00Trrk;q|UN1)*Zr@c=pl~Dk%>$iB z*B*_}qp=^K+L)NMu|~R^$5%R6q`EUvEfBke-Nea%9;=gF*>pSd+>hJ*NYRrK-}uk; z%wNBf`(rDOr_C*HMJ-;(UfCBkF}I1xeLFvARx`oApp!~jSW#u{539_SqOS=t7v}#U zpAzsbb)LC??*qE9q zK!rKDcC;7@*!;BfqP}ay)>4(EZX@tg-QF(^{T@qg|BCVX&$>S__kPb`pS!jsgbMQA zZ1p8Ss?|%nncM8vZB}pXGxKd=L+xrD625oKuWSesL4Hm(kx9`eR1xv> zbWEoERD!3bv^2NwUr-DRe=LmB6LtNi;V|1M@h(=FMNH?hg(}@kzR*q~fI4!4MF;S4 zsu~&~Z{o6mNj(YYhn#2L-jcFFr#84h0I2xQ2E6Q{uPmQQJtM)-G;&XlS}lQK34~H* zUr3tzpSlwghW=wd!H1yKf`FzTXpMl%Y|x3w6iAu={BZ&7M;kB zbiYfmLvhK-m=H95>~dD!R26n3g?`6;0KEZlYMhIKXzSs-|`aI45R( 
zcQgqhCm>S6lophaQC&em9S9_jG@dGoKSDTehb{vG5()r1IMoCawS=6pr+{8M=5GTo z&Ve7pZ`64N1iHbBN>+`hVBp?8_M86X2%4UOpddo2f#?!i{u%?2Q3bRVAh&~eCu%0( z`!>Lw<|1kF34hd!icU^W1~(cTrF{Ya>TD7LI!|dyK%vMnKR6w(oQwTczK0Ts|^6F%b?zO9;s+AayEi4v$e* zCT%no3}{`Qm(8YS1~=(P^E3CQE&CddL0?JJ(d?N!v#Rm`^FimW)mc9@5YX&@L{xt zGuv!NAZUd$OY1stz4GJ8MUDNAR*FtzcYxwA0$?k^Lo3?+A>Iee823OoX=L5;KO-y0{Vk1>g5zueoyH7ri z>W&8$!eCOWeanM#V~NF60mBavn52pzB?!zPA^$U9LJ&>wC9RT$v`$0EmaW_%f4 zmcs%zkfxD>JvUSRI{MX8{910@@6n*A=BgxJs9(fTe>@upxAj8C1an%1g=g-{HKyGW(XF(MG?FP|xTtCe?3`CyyN9q2VVP!EQ?lcyYia zOAd0Lo2t9+JGe((?f_%5ykO(|i6^Er{PX^F;W@8?=qImC@sB2lOi)gYTEGpTvB}kmKU3? zwtXywDt{?jOy#X>zW=trp2I?_PJbz~J9%^5 z)fRn%4|GCN4;Q{CL7k5{X#n^A*qq-} zb*Y3LLTBTsj+b88O7F_z=cmzpYKAigez^2xyMBzYy)a@d%T9 zhtgYhQuCj_wGVkxnYfe9Z#<(tHD=BT|Cm=~VOdoCp(DPQH1Q1`Sf?(Xb)<{CUD{88 zeupDK_VtCD_RETvh{kfJS_ggSY5O0p(?1SsM+GP#?YYD#t(HrWki>p??-+@yk&Nh( zCxUM>Tr)G=(4Ku_lkPHtUT1l2&Y+!iwO$6U$km>v!423yYHy!Vk{RAmB}T+L>l{c~l$G_vmFC)Zfbun%n&P$c zptcYyFJQ>HQn@7qnigp2Cv-|e^C-Yy3f{cAlYpAcW}qeo zuYl?t#mzmtWs`sc<>QlAH}h=U;={RaOFaS_4PZFWH>Os;2*w5Kpgn(j8MJ&|Z+*6r_xKtv8YYma>_cysF;s53HR=N+D_FE@uo zByQ;$!3*GLSM1z`P%59#`i)K=b+irI{dL$upSVs@rGCsGDOMvFULb-?GLh_mG;8hY z!~9_IgXD|7*m`fNUuM(^KnfF=7>l2*CEXoZr(Ty(l6cO8M@WF9pPsbJ;!>P#>6_C$bn6qz_%~Bbu1=|eY zRXL$lK_9y>C}~7fM_>J0TuRfE6~k+wqzEqI^TEEl?H-J&%S)&NdKem5lo(}AUts=7 zqcwJ%SAQ)x7mT~;)CQAmDrj3MP)I7JpiQ()#QONcp!ru-#b#k5k!(~Hq#r%A-eQ6O z{;tNu54Ki%r0;Ui;9AC^y(aP%m2ByomMhV|E#dbMxJ$jRDY=w z6i(oIQXs0{j2+yV?Utkzdb<00lx)i8NcJA11cIJ&dCMd0SmFtY z(4+fkOI4m6{1zTTZ4+3ZEPR;hb_~`s1;w^_ZA0{4G{HFlu(an09;5D*;zmOssX;}- zmCzN>$@TDm2jS*`x`{_ou*ASXYklH_#Raqs`HdLOZMXyQ{6kRoK$ule?2n2bc1561 z2HB_OE>9RWJ-OkefE$X8iXt?8*T2{4Zn*ID3Zb|_;@!0N{rkj;{{Z-`ZE6||5=h^I zW$ALL67H#kLnjF2Bs9lCWzvB^$j{3oHWifOds&H~hXc7n8~)%b5=jJf1_Td=vU14O z6CFm&jwiU@zP>Jy!cgHbDFUpdgaiaaZXxKsj0wc#Q5B6Ev}Ui;=2Y&8K;~Z-G5Y1) zCtH6b98PNQD1lE_LPa6}J)m5ca&L~)l>WA2!l5U1*FG@;yh|BAvaDE>b;f?_CdIv` z(h}BV>Hog)tk=}gDgX`dP|{<26GTvPy(cJ8bInzGI0dq1zV(}Pgw7Ck@HU@l 
zO%NZ_x29KD`En_&mHE@piSN6&0^|+$m_=-?GTZW8o`@P}-m}Dsde?7R&zy%E9Wc08 z7^WFT-`_ze-PD~Od#!b1Gh&hHCQbzHCcsKZ@U6yQNRn<|y#`h=?UeUq{_oC-iZc6h zj|;osSSsIz|FJ6LeNaf%Fc8!7>4C-bF7%b_9x>6%iSvngDwv0jqTym^N;4( {output}" \ No newline at end of file + "samtools view -@ 16 -f 4 {input.i1} > {output}" \ No newline at end of file diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk index 47763ee..a9321bc 100644 --- a/rules/extract_tags.smk +++ b/rules/extract_tags.smk @@ -1,7 +1,6 @@ rule extract_tags: input: - i1 = "results/cellranger/{sample}/unmapped_reads.sam", - i2 = "results/kraken2/{sample}/{sample}.kraken" + "results/cellranger/{sample}/unmapped_reads.sam" output: "results/count_matrix/{sample}/count_matrix.tsv" threads: 16 @@ -9,8 +8,9 @@ rule extract_tags: mem_mb = 40000 params: p1 = "results/count_matrix/{sample}/", - p2 = "results/cellranger/{sample}/{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" + p2 = "results/cellranger/{sample}/filtered_feature_bc_matrix/barcodes.tsv.gz", + p3 = "results/kraken2/{sample}.kraken" log: "results/logs/count_matrix/{sample}_bam_extract.log" shell: - "python3 scripts/bam_extract.py -i {input.i1} -k {input.i2} -b {params.p2} -o {params.p1}" \ No newline at end of file + "python3 {config[scripts_dir]}/bam_extract.py -i {input} -k {params.p3} -b {params.p2} -o {params.p1}" \ No newline at end of file diff --git a/rules/kraken2_mapping.smk b/rules/kraken2_mapping.smk index e3c48b2..9338281 100644 --- a/rules/kraken2_mapping.smk +++ b/rules/kraken2_mapping.smk @@ -2,11 +2,9 @@ rule kraken2_mapping: input: i1 = "data/{sample}/{sample}_S1_L001_R2_001.fastq.gz" output: - o1 = "results/kraken2/{sample}/{sample}.kraken", - o2 = "results/kraken2/{sample}/{sample}.report.txt", - #o3 = "results/kraken2/{sample}/{sample}_classified_out.fastq", - o4 = temp("data/{sample}_test.txt") - priority: 95 + o1 = "results/kraken2/{sample}.kraken", + o2 = "results/kraken2/{sample}.report.txt", + priority: 90 resources: mem_mb = 20000 threads: 16 @@ 
-17,5 +15,4 @@ rule kraken2_mapping: shell: """ kraken2 --use-names --threads {threads} --db {params.p1} --report {output.o2} --output {output.o1} {input.i1} 2> {log} - touch {output.o4} """ \ No newline at end of file diff --git a/rules/report.smk b/rules/report.smk new file mode 100644 index 0000000..5ce3644 --- /dev/null +++ b/rules/report.smk @@ -0,0 +1,20 @@ +rule kraken_reports: + input: + i1 = expand("results/count_matrix/{sample}/count_matrix.tsv",sample=samples) + output: + o1 = "results/kraken_plots/Familywise_tax_readcounts.tsv", + o2 = "results/kraken_plots/Specieswise_tax_readcounts.tsv", + o3 = "results/kraken_plots/Clustermap_Familywise_log10.png", + o4 = "results/kraken_plots/Clustermap_Specieswise_log10.png" + priority: 10 + resources: + mem_mb = 1000 + params: + p1 = "results/kraken_plots/", + p2 = "results/kraken2/" + log: + "results/logs/plots/kraken_plots.log" + shell: + """ + python3 scripts/kraken_plot.py -i {params.p2} -o {params.p1} + """ diff --git a/scripts/synapse_fetch.py b/scripts/synapse_fetch.py index e1463d7..60c7aab 100644 --- a/scripts/synapse_fetch.py +++ b/scripts/synapse_fetch.py @@ -104,7 +104,7 @@ regex=True) df2_pivoted['Samples'] = df2_pivoted['Samples'].str.replace(r'L0*(\d+)', r'L\1', regex=True) -df2_pivoted.to_csv(args.output_file_dir + "metadata.tsv", +df2_pivoted.to_csv(args.output_file_dir + "synapse_samplesheet.tsv", sep="\t", index=False) print("\nRetrieving Data Successfull...") From 51a5b28ee7c202ea9ad937e06462bf9f2400c07d Mon Sep 17 00:00:00 2001 From: Saim Date: Wed, 9 Aug 2023 15:14:09 +0200 Subject: [PATCH 30/33] Updated config.yaml --- config.yaml | 12 ++++++------ samplesheet.tsv | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 samplesheet.tsv diff --git a/config.yaml b/config.yaml index 89116d8..ce92ccf 100644 --- a/config.yaml +++ b/config.yaml @@ -1,7 +1,7 @@ -"samplesheet" : +"samplesheet" : "samplesheet.tsv" "mode" : "synapse" -"local_files_dir" : -"kraken_db": -"cellranger": 
-"transcriptome" : -"scripts_dir": \ No newline at end of file +"local_files_dir" : None +"kraken_db": "Path to Custom KrakenDB" +"cellranger": "Path of CellRanger Executable" +"transcriptome" : "Path to CellRanger Reference Transcriptome" +"scripts_dir": "scripts/" \ No newline at end of file diff --git a/samplesheet.tsv b/samplesheet.tsv new file mode 100644 index 0000000..6668501 --- /dev/null +++ b/samplesheet.tsv @@ -0,0 +1,2 @@ +Samples R1 R2 +D17-8765_S1L1 syn18641014 syn18641249 \ No newline at end of file From 37d30d9f386216effbe25d75ff1d0e70eb997551 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 2 Oct 2023 13:40:24 +0200 Subject: [PATCH 31/33] Added additional Count-matrix reporting based on Kraken-classified reads --- Snakefile | 7 +++++-- env.yaml | 4 ++-- rules/download_samples_or_copy.smk | 3 +-- rules/extract_tags.smk | 11 ++++++++--- rules/kraken_processing.smk | 23 +++++++++++++++++++++++ rules/report.smk | 4 ++-- scripts/bam_extract.py | 3 ++- 7 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 rules/kraken_processing.smk diff --git a/Snakefile b/Snakefile index e15be4e..832afa5 100644 --- a/Snakefile +++ b/Snakefile @@ -11,6 +11,7 @@ samples = list(list_of_samples.Samples.unique()) include: "rules/download_samples_or_copy.smk" include: "rules/kraken2_mapping.smk" include: "rules/cellranger.smk" +include: "rules/kraken_processing.smk" include: "rules/extract_bam.smk" include: "rules/extract_tags.smk" include: "rules/report.smk" @@ -21,10 +22,12 @@ rule all: expand("data/{sample}/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), expand("results/kraken2/{sample}.kraken",sample=samples), expand("results/kraken2/{sample}.report.txt",sample=samples), - directory(expand("results/cellranger/{sample}/{sample}/",sample=samples)), + expand("results/cellranger/{sample}/{sample}/",sample=samples), + expand("results/kraken_reads/{sample}_kraken_reads.sam", sample=samples), expand("results/cellranger/{sample}/possorted_genome_bam.bam", 
sample=samples), expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples), + expand("results/count_matrix/{sample}/kraken_reads_count_matrix.tsv",sample=samples), "results/kraken_plots/Familywise_tax_readcounts.tsv", "results/kraken_plots/Specieswise_tax_readcounts.tsv", "results/kraken_plots/Clustermap_Familywise_log10.png", @@ -34,4 +37,4 @@ onsuccess: print("sc-VirusScan Pipeline finished successfully!") onerror: - print("sc-VirusScan Pipeline has failed!") \ No newline at end of file + print("sc-VirusScan Pipeline has failed!") diff --git a/env.yaml b/env.yaml index 1c13e3d..76d6d62 100644 --- a/env.yaml +++ b/env.yaml @@ -5,11 +5,11 @@ channels: - defaults dependencies: - kraken2 - - samtools + - samtools==1.12 - entrez-direct - parallel-fastq-dump - pandas - snakemake - seaborn - pip: - - synapseclient \ No newline at end of file + - synapseclient diff --git a/rules/download_samples_or_copy.smk b/rules/download_samples_or_copy.smk index 2e1cf04..d6c6292 100644 --- a/rules/download_samples_or_copy.smk +++ b/rules/download_samples_or_copy.smk @@ -30,10 +30,9 @@ rule download_samples_or_copy: mv data/{wildcards.sample}/*_R2_* data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz else - mkdir tmp/ + mkdir -p tmp/ parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip --tmpdir tmp/ mv data/{wildcards.sample}/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R1_001.fastq.gz mv data/{wildcards.sample}/{wildcards.sample}_3.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz - rm -rf tmp/ fi """ diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk index a9321bc..f438a9c 100644 --- a/rules/extract_tags.smk +++ b/rules/extract_tags.smk @@ -1,8 +1,10 @@ rule extract_tags: input: - "results/cellranger/{sample}/unmapped_reads.sam" + i1 = 
"results/cellranger/{sample}/unmapped_reads.sam", + i2 = "results/kraken_reads/{sample}_kraken_reads.sam" output: - "results/count_matrix/{sample}/count_matrix.tsv" + o1 = "results/count_matrix/{sample}/count_matrix.tsv", + o2 = "results/count_matrix/{sample}/kraken_reads_count_matrix.tsv", threads: 16 resources: mem_mb = 40000 @@ -13,4 +15,7 @@ rule extract_tags: log: "results/logs/count_matrix/{sample}_bam_extract.log" shell: - "python3 {config[scripts_dir]}/bam_extract.py -i {input} -k {params.p3} -b {params.p2} -o {params.p1}" \ No newline at end of file + """ + python3 {config[scripts_dir]}/bam_extract.py -i {input.i1} -k {params.p3} -b {params.p2} -o {output.o1} + python3 {config[scripts_dir]}/bam_extract.py -i {input.i2} -k {params.p3} -b {params.p2} -o {output.o2} + """ \ No newline at end of file diff --git a/rules/kraken_processing.smk b/rules/kraken_processing.smk new file mode 100644 index 0000000..802dde6 --- /dev/null +++ b/rules/kraken_processing.smk @@ -0,0 +1,23 @@ +rule kraken2_processing: + input: + i1 = "results/cellranger/{sample}/possorted_genome_bam.bam", + i2 = "results/kraken2/{sample}.kraken" + output: + o1 = "results/kraken_reads/{sample}_kraken_reads.sam", + priority: 90 + resources: + mem_mb = 20000 + threads: 16 + params: + p1 = "results/kraken_reads/{sample}_ReadIds.txt", + p2 = "results/kraken_reads/{sample}_krakenReads.bam" + log: + "results/logs/kraken2/{sample}.kraken.log" + shell: + """ + mkdir -p results/kraken_reads + awk -F'\t' '$3 != "unclassified (taxid 0)" && $3 != "Homo sapiens (taxid 9606)" {{print $2}}' {input.i2} > {params.p1} + samtools view -N {params.p1} -o {params.p2} {input.i1} + samtools index {params.p2} + samtools view -h {params.p2} > {output.o1} + """ \ No newline at end of file diff --git a/rules/report.smk b/rules/report.smk index 5ce3644..4b0f3ee 100644 --- a/rules/report.smk +++ b/rules/report.smk @@ -8,7 +8,7 @@ rule kraken_reports: o4 = "results/kraken_plots/Clustermap_Specieswise_log10.png" priority: 10 
resources: - mem_mb = 1000 + mem_mb = 40000 params: p1 = "results/kraken_plots/", p2 = "results/kraken2/" @@ -16,5 +16,5 @@ rule kraken_reports: "results/logs/plots/kraken_plots.log" shell: """ - python3 scripts/kraken_plot.py -i {params.p2} -o {params.p1} + python3 {config[scripts_dir]}/kraken_plot.py -i {params.p2} -o {params.p1} """ diff --git a/scripts/bam_extract.py b/scripts/bam_extract.py index f3b8dde..2193d44 100644 --- a/scripts/bam_extract.py +++ b/scripts/bam_extract.py @@ -13,6 +13,7 @@ help="Filtered Barcodes TSV file from CellRanger") parser.add_argument("-o", "--output-file", metavar="PATH", + default="count_matrix.tsv", help="Path to your output file") args = parser.parse_args() @@ -69,4 +70,4 @@ result = merged_barcodes.pivot_table(index='Tax_ID', columns='CR', values='Read Name', aggfunc='count').fillna(0.).astype(int) -result.to_csv(args.output_file + "count_matrix.tsv", sep='\t') +result.to_csv(args.output_file, sep='\t') From 5c476ad768efff0afbdd9cc0807f72ec791906b2 Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 2 Oct 2023 13:51:38 +0200 Subject: [PATCH 32/33] Minor Updations to Slurm Resources --- Snakefile | 2 ++ rules/cellranger.smk | 5 +++-- rules/extract_bam.smk | 6 +++++- rules/extract_tags.smk | 4 ++-- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index 832afa5..7e84105 100644 --- a/Snakefile +++ b/Snakefile @@ -16,6 +16,8 @@ include: "rules/extract_bam.smk" include: "rules/extract_tags.smk" include: "rules/report.smk" +localrules: download_samples_or_copy + rule all: input: expand("data/{sample}/{sample}_S1_L001_R1_001.fastq.gz", sample=samples), diff --git a/rules/cellranger.smk b/rules/cellranger.smk index 476ca35..70cec00 100644 --- a/rules/cellranger.smk +++ b/rules/cellranger.smk @@ -6,8 +6,9 @@ rule cellranger: o2 = "results/cellranger/{sample}/possorted_genome_bam.bam" priority: 80 resources: - mem_mb = 26000 - threads: 30 + mem_mb = 80000, + runtime = 1200 + threads: 20 log: 
"results/cellranger/{sample}/{sample}.cellranger.log" params: diff --git a/rules/extract_bam.smk b/rules/extract_bam.smk index 37bf776..79ba725 100644 --- a/rules/extract_bam.smk +++ b/rules/extract_bam.smk @@ -3,7 +3,11 @@ rule extract_bam: i1 = "results/cellranger/{sample}/possorted_genome_bam.bam" output: temp("results/cellranger/{sample}/unmapped_reads.sam") + resources: + mem_mb = 4000, + runtime = 120 + threads: 16 params: "results/cellranger/{sample}/{sample}/outs/possorted_genome_bam.bam" shell: - "samtools view -@ 16 -f 4 {input.i1} > {output}" \ No newline at end of file + "samtools view -@ {threads} -f 4 {input.i1} > {output}" \ No newline at end of file diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk index f438a9c..d2d3715 100644 --- a/rules/extract_tags.smk +++ b/rules/extract_tags.smk @@ -5,9 +5,9 @@ rule extract_tags: output: o1 = "results/count_matrix/{sample}/count_matrix.tsv", o2 = "results/count_matrix/{sample}/kraken_reads_count_matrix.tsv", - threads: 16 + threads: 8 resources: - mem_mb = 40000 + mem_mb = 60000 params: p1 = "results/count_matrix/{sample}/", p2 = "results/cellranger/{sample}/filtered_feature_bc_matrix/barcodes.tsv.gz", From 41f50ad547ae1cf002ca5e8152318c1162a544cf Mon Sep 17 00:00:00 2001 From: Saim Date: Mon, 2 Oct 2023 13:54:25 +0200 Subject: [PATCH 33/33] Minor Update to Snakemake rule --- rules/kraken_processing.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/kraken_processing.smk b/rules/kraken_processing.smk index 802dde6..92eca73 100644 --- a/rules/kraken_processing.smk +++ b/rules/kraken_processing.smk @@ -12,7 +12,7 @@ rule kraken2_processing: p1 = "results/kraken_reads/{sample}_ReadIds.txt", p2 = "results/kraken_reads/{sample}_krakenReads.bam" log: - "results/logs/kraken2/{sample}.kraken.log" + "results/logs/kraken2_processing/{sample}.log" shell: """ mkdir -p results/kraken_reads