14 create test dataset (#23)

* Delete seqc2_testdata_aceseq directory * ACE-seq test dataset * testdataset additions * get beagle ref automatically * annotations testdata * testdata * small changes * beagle ref hg37 added to igenomes * testdata is running hg19
ghga-de · Jan 23, 2024 · f7f8a40 · f7f8a40
1 parent 4a81ce4
commit f7f8a40
Show file tree

Hide file tree

Showing 65 changed files with 286,560 additions and 269 deletions.
diff --git a/.../artifact.homoDels.potentialArtifacts.txt → .../artifact.homoDels.potentialArtifacts.txt b/.../artifact.homoDels.potentialArtifacts.txt → .../artifact.homoDels.potentialArtifacts.txt
diff --git a/assets/centromers.hg19.txt → assets/hg19/centromers.hg19.txt b/assets/centromers.hg19.txt → assets/hg19/centromers.hg19.txt
diff --git a/assets/hg19/hg19_GRch37_100genomes_gc_content_10kb.txt b/assets/hg19/hg19_GRch37_100genomes_gc_content_10kb.txt
diff --git a/assets/hg19_cytoBand.txt → assets/hg19/hg19_cytoBand.txt b/assets/hg19_cytoBand.txt → assets/hg19/hg19_cytoBand.txt
diff --git a/assets/hg19/hg19_gaps.txt b/assets/hg19/hg19_gaps.txt
diff --git a/...fact.homoDels.potentialArtifacts.hg38.txt → ...fact.homoDels.potentialArtifacts.hg38.txt b/...fact.homoDels.potentialArtifacts.hg38.txt → ...fact.homoDels.potentialArtifacts.hg38.txt
diff --git a/assets/centromers.hg38.txt → assets/hg38/centromers.hg38.txt b/assets/centromers.hg38.txt → assets/hg38/centromers.hg38.txt
diff --git a/assets/hg38_cytoBand.txt → assets/hg38/hg38_cytoBand.txt b/assets/hg38_cytoBand.txt → assets/hg38/hg38_cytoBand.txt
diff --git a/bin/estimateHRDScore.sh b/bin/estimateHRDScore.sh
@@ -2,7 +2,7 @@ set -euo pipefail
 # Copyright (c) 2017 The ACEseq workflow developers.
 # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt).
 
-usage() { echo "Usage: $0 [-p pid] [-i jsonfile] [-m legacyMode] [-b blacklistFileName] [-s sexfile] [-c centromers] [-y cytobandsFile]" 1>&2; exit 1; }
+usage() { echo "Usage: $0 [-p pid] [-i jsonfile] [-m legacyMode] [-b blacklistFileName] [-s sexfile] [-c centromers] [-y cytobandsFile] [-x chrprefix]" 1>&2; exit 1; }
 
 while [[ $# -gt 0 ]]
 do
@@ -43,6 +43,11 @@ do
 			shift # past argument
 	    	shift # past value
 			;;
+		-x)
+			chrprefix=$2
+			shift # past argument
+	    	shift # past value
+			;;		
 	esac
 done
 
@@ -110,7 +115,7 @@ do
 	fi
 
 	#this file could be written out and sorted according to chromosomes
-	smoothData.py -f $combProFile.tmp  -o $combProFile.tmp.tmp && mv $combProFile.tmp.tmp $combProFile.tmp && \
+	smoothData.py -f $combProFile.tmp -p $chrprefix  -o $combProFile.tmp.tmp && mv $combProFile.tmp.tmp $combProFile.tmp && \
 	removeBreakpoints.py -f $combProFile.tmp -o $combProFile.tmp.tmp && mv $combProFile.tmp.tmp $combProFile.tmp
 	if [[ "$?" != 0 ]]
 	then

diff --git a/bin/prepare_beagle_ref.sh b/bin/prepare_beagle_ref.sh
@@ -0,0 +1,52 @@
+set -euo pipefail
+# Copyright (c) 2017 The ACEseq workflow developers.
+# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt).
+
+usage() { echo "Usage: $0 [-r ref -m memory]" 1>&2; exit 1; }
+
+while [[ $# -gt 0 ]]
+do
+  key=$1
+  case $key in
+  		-r)
+			  ref=$2
+			  shift # past argument
+	    	shift # past value
+			;;
+      -m)
+			  memory=$2
+			  shift # past argument
+	    	shift # past value
+			;;
+	esac
+done
+
+if [[ ${ref} == 'hg37' ]]; then
+  for i in {1..22}; do
+    echo $i
+    echo https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chr$i.1kg.phase3.v5a.b37.bref3
+    wget https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chr$i.1kg.phase3.v5a.b37.bref3
+  done
+
+  wget https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chrX.1kg.phase3.v5a.b37.bref3
+
+  wget https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh37.map.zip
+
+  unzip plink.GRCh37.map.zip
+
+else
+
+  for i in {1..22}; do
+    echo $i
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+    (zcat ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | head -n 300 | grep "#" ; zcat ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | grep -v "#" | sed 's/^/chr/') | java -Xmx${memory}M -jar beagle.jar > ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref3
+  done
+
+  wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+
+  wget https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip
+
+  unzip plink.GRCh38.map.zip
+
+fi
+
diff --git a/bin/python_modules/Options.pyc b/bin/python_modules/Options.pyc
diff --git a/bin/python_modules/Tabfile.pyc b/bin/python_modules/Tabfile.pyc
diff --git a/bin/python_modules/__init__.pyc b/bin/python_modules/__init__.pyc
diff --git a/bin/smoothData.py b/bin/smoothData.py
@@ -17,6 +17,7 @@
 parser.add_argument( '--file',	'-f', type=file, help="segment file with copy number information" )
 parser.add_argument( '--out',	'-o', default=sys.stdout, type=str,  help='outputfile' )
 parser.add_argument( '--maxDistToNext',	'-m', default=1000, type=int,  help='maximum allowed distance between segments to be merged' )
+parser.add_argument( '--chr_prefix',	'-p', default='chr', type=str,  help='' )
 # maxDistToNext is used only used for first segments [in chromosome | after large gap]
 
 
@@ -26,6 +27,7 @@
 
 args = parser.parse_args()
 maxDistToNext=args.maxDistToNext
+chr_prefix = args.chr_prefix
 out = args.out
 cutoff = 0.5
 maxLen = 3000000
@@ -159,8 +161,6 @@ def merge_first_segment(prior_line, newline):
 		prior_line["length"] = int(int(prior_line["end"]) - int(prior_line["start"]) +1 )
 		return(prior_line)
 
-
-
 if __name__ == "__main__":
 	#read first two lines of file before looping over all lines
 	prior_line = infile.readline()
@@ -234,7 +234,13 @@ def merge_first_segment(prior_line, newline):
 			prior_line = merge_gap(prior_line, newline)
 			newline=None
 
-	#print last line(s)
-	out.write( "\t".join( str(prior_line[key]) for key in infile.header ) + "\n" ) 
-	if newline:
-		out.write( "\t".join( str(newline[key]) for key in infile.header ) + "\n" )
+	if (chr_prefix == "chr"):
+		#print last line(s)
+		out.write( "\t".join( str(prior_line[key]) for key in infile.header ) + "\n" ) 
+		if newline:
+			out.write( "\t".join( str(newline[key]) for key in infile.header ) + "\n" )	
+	else:
+		#print last line(s)
+		out.write( "\t".join( str(prior_line[0][key]) for key in infile.header ) + "\n" ) 
+		if newline:
+			out.write( "\t".join( str(newline[0][key]) for key in infile.header ) + "\n" )	
diff --git a/conf/dkfz_cluster_hg37.config b/conf/dkfz_cluster_hg37.config
@@ -21,7 +21,7 @@ params {
     max_time   = '48.h'
 
     // Input data
-    input  = 'seqc2_testdata_chr21_hg37/samplesheet_37.csv'
+    input  = 'testdata/samplesheet.csv'
 
     // workflow parameters
     outdir                     = "results37"
@@ -47,10 +47,10 @@ params {
     chr_prefix                 = ""
 
     // Beagle reference
-    beagle_reference           = "${params.data_path}/tools_data/Beagle"
-    beagle_genetic_map         = "${params.data_path}/tools_data/genetic_maps"
-    beagle_ref_ext             = "bref3" // vcf | bref | bref3
-    beagle_map_ext             = "map"
+    beagle_ref                 = "${params.data_path}/tools_data/Beagle"
+    plink_map                  = "${params.data_path}/tools_data/genetic_maps"
+    //beagle_ref_ext             = "bref3" // vcf | bref | bref3
+    //beagle_map_ext             = "map"
 
     // Annotation files
     dbsnp_snv                  = "${params.data_path}/databases/dbSNP/dbSNP_135/00-All.SNV.vcf.gz"
@@ -63,8 +63,8 @@ params {
     centromer_file             = "${params.data_path}/stats/hg19_gaps.txt"
 
     // HDR estimation
-    blacklist_file             = "assets/artifact.homoDels.potentialArtifacts.txt"
-    cytobands_file             = "assets/hg19_cytoBand.txt"
+    blacklist_file             = "assets/hg19/artifact.homoDels.potentialArtifacts.txt"
+    cytobands_file             = "assets/hg19/hg19_cytoBand.txt"
 }
 
 // Perform work directory cleanup when the run has succesfully completed

diff --git a/conf/dkfz_cluster_hg38.config b/conf/dkfz_cluster_hg38.config
@@ -32,7 +32,7 @@ params {
     fasta_fai                  = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38/sequence/GRCh38_decoy_ebv_alt_hla_phiX.fa.fai'
     chrom_sizes                = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv'
     chr_prefix                 = "chr"
-    //contig_file              = "assets/GRCh38_decoy_ebv_phiX_alt_hla_chr.contig.bed"
+    //contig_file              = "assets/hg38/GRCh38_decoy_ebv_phiX_alt_hla_chr.contig.bed"
 
     // Beagle reference
     beagle_reference           = "${params.data_path}/tools_data/Beagle/1000genomes"
@@ -51,8 +51,8 @@ params {
     centromer_file             = "${params.data_path}/stats/gap_with_centromeres.header.txt"
 
     // HDR estimation
-    blacklist_file             = "assets/artifact.homoDels.potentialArtifacts.hg38.txt"
-    cytobands_file             = "assets/hg38_cytoBand.txt"
+    blacklist_file             = "assets/hg38/artifact.homoDels.potentialArtifacts.hg38.txt"
+    cytobands_file             = "assets/hg38/hg38_cytoBand.txt"
  }
 
 // Perform work directory cleanup when the run has succesfully completed

diff --git a/conf/igenomes.config b/conf/igenomes.config
@@ -28,7 +28,9 @@ params {
             known_indels_tbi      = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz.tbi"
             known_indels_vqsr     = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.indels.b37.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.b37.vcf.gz'
             mappability           = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem"
-            chr_prefix            = ""  
+            chr_prefix            = ""
+            beagle_ref            = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/beagle"
+            plink_map             = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/plink"
         }
         'GRCh38' {
             chr_dir               = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes"

diff --git a/conf/modules.config b/conf/modules.config
@@ -42,6 +42,14 @@ process {
             mode: params.publish_dir_mode
         ]
     }
+    withName: BEAGLE5_BEAGLE {
+        ext.args   = "impute=false"
+        publishDir = [
+            path:  {"${params.outdir}/${meta.id}"},
+            pattern: "{noting}",
+            mode: params.publish_dir_mode
+        ]
+    }
     withName: 'ADD_HAPLOTYPES' {
         ext.args   = ""
         publishDir = [
@@ -52,7 +60,6 @@ process {
     }
 
     withName: 'ANNOTATE_CNV' {
-        ext.args   = ""
         publishDir = [
             path:  {"${params.outdir}/${meta.id}/anno_cnv"},
             pattern: "*{.tab.gz,.tab.gz.tbi}",
@@ -169,7 +176,7 @@ process {
 // Don't publish results for these processes
 //
 process {    
-    withName: 'GREP_GENOTYPES|GETCHROMSIZES|EMBED_HAPLOTYPES|BEAGLE5_BEAGLE|CREATE_UNPHASED|FAKE_CONTROL|GET_GENOTYPES|MAKE_MOCK|WIN_GENERATOR|CREATE_FAKE_SAMPLES' {
+    withName: 'GREP_GENOTYPES|GETCHROMSIZES|EMBED_HAPLOTYPES|BEAGLE5_BEAGLE|CREATE_UNPHASED|FAKE_CONTROL|GET_GENOTYPES|MAKE_MOCK|WIN_GENERATOR|CREATE_FAKE_SAMPLES|PREPARE_BEAGLE_REF' {
         publishDir = [
             path: { "${params.outdir}/test" },
             enabled: false

diff --git a/conf/test.config b/conf/test.config
@@ -5,7 +5,8 @@
     Defines input files and everything required to run a fast and simple pipeline test.
 
     Use as follows:
-        nextflow run main.nf -profile test,singularity --outdir <OUTDIR>
+        nextflow run main.nf -profile test,singularity
+        nextflow run main.nf -profile test,docker
 
 ----------------------------------------------------------------------------------------
 */
@@ -21,20 +22,17 @@ params {
     max_time   = '8.h'
 
     // Input data
-    input  = 'testdata/samplesheet.csv'
+    input                      = "${projectDir}/testdata/samplesheet.csv"
+    outdir                     = "${projectDir}/results"
 
     // workflow parameters
-    outdir                     = "test_results"
+
     estimatesex                = false
     createbafplots             = false
 
     minHT                      = 0
 
     // correctGC options
-    lowess_f                   = 5
-    scale_factor               = 0
-    covplot_ylims              = 4
-    gc_bias_json_key           = "gc-bias"
     minLim                     = 0.47
     maxLim                     = 0.53
     min_length_purity          = 1000000
@@ -59,58 +57,57 @@ params {
     legacyMode                 = false
 
      // Reference //
-    data_path                  = "/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes"
-    fasta                      = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa'
-    fasta_fai                  = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa.fai'
-    chrom_sizes                = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/stats/hs37d5.fa.chrLenOnlyACGT_realChromosomes.tab'
-    chr_prefix                 = ""
-
-    // Beagle reference
-    beagle_reference           = "${params.data_path}/tools_data/Beagle"
-    beagle_genetic_map         = "${params.data_path}/tools_data/genetic_maps"
-    beagle_ref_ext             = "bref3" // vcf | bref | bref3
-    beagle_map_ext             = "map"
+    genome                     = "GRCh37"
 
     // Annotation files
-    dbsnp_snv                  = "${params.data_path}/databases/dbSNP/dbSNP_135/00-All.SNV.vcf.gz"
-    mapability_file            = "${params.data_path}/databases/UCSC/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz"
-    replication_time_file      = "${params.data_path}/databases/ENCODE/ReplicationTime_10cellines_mean_10KB.Rda"
-    gc_content_file            = "${params.data_path}/stats/hg19_GRch37_100genomes_gc_content_10kb.txt"
-    gene_annotation_file       = "${params.data_path}/tools_data/ACEseq/INFORM_druggable_genes.csv"
+    dbsnp_snv                  = "${projectDir}/testdata/annotations/dbsnp_snv.test.vcf.gz"
+    mapability_file            = "${projectDir}/testdata/annotations/mappability5.test.bed.gz"
+    replication_time_file      = "${projectDir}/testdata/annotations/ReplicationTime_10cellines_mean_10KB.Rda"
+    gc_content_file            = "assets/hg19/hg19_GRch37_100genomes_gc_content_10kb.txt"
+    gene_annotation_file       = "${projectDir}/testdata/annotations/druggable_genes.csv"
 
     // get breakpoints/ PSCBS gaps
-    centromer_file             = "${params.data_path}/stats/hg19_gaps.txt"
+    centromer_file             = "assets/hg19/hg19_gaps.txt"
 
     // HDR estimation
-    blacklist_file             = "assets/artifact.homoDels.potentialArtifacts.txt"
-    cytobands_file             = "assets/hg19_cytoBand.txt"
+    blacklist_file             = "assets/hg19/artifact.homoDels.potentialArtifacts.txt"
+    cytobands_file             = "assets/hg19/hg19_cytoBand.txt"
  }
 
-// Perform work directory cleanup when the run has succesfully completed
- cleanup = true
-
-// Reduce the job submit rate to about 5 per second, this way the server won't be bombarded with jobs
-
-singularity {
-  enabled = true
-  cacheDir = "/omics/groups/OE0608/internal/kubran/singularity"
-  autoMounts = true
-  runOptions = "-B /omics/groups -B /omics/odcf/analysis -B /omics/odcf/project -B /omics/odcf/reference_data"
-}
-env {
-    SINGULARITY_CACHEDIR="/omics/groups/OE0608/internal/kubran/singularity"
-    SINGULARITY_LIBRARYDIR="/omics/groups/OE0608/internal/kubran/singularity/library"
+// Enable container engines/virtualisation envs for CI testing
+// only works when specified with the profile ENV
+// otherwise tests can be done with the regular provided profiles
+if (System.getenv('PROFILE')) {
+    if ("$PROFILE" == "docker") {
+        conda.enabled                            = false
+        docker.enabled                           = true
+        docker.userEmulation                     = { params.use_gatk_spark ? false : true }.call()
+        charliecloud.enabled                     = false
+        podman.enabled                           = false
+        shifter.enabled                          = false
+        singularity.enabled                      = false
+    } else if ("$PROFILE" == "singularity") {
+        conda.enabled                            = false
+        params.singularity_pull_docker_container = false
+        singularity.autoMounts                   = true
+        singularity.enabled                      = true
+        charliecloud.enabled                     = false
+        docker.enabled                           = false
+        podman.enabled                           = false
+        shifter.enabled                          = false
+    }
 }
-
 process {
-  executor = 'lsf'
-  scratch = '$SCRATCHDIR/$LSB_JOBID'
-
-}
-executor {
-  name = 'lsf'
-  perTaskReserve = false
-  perJobMemLimit = true
-  submitRateLimit = '30 sec'
-  queueSize=50
-  }
+   withName:'BCFTOOLS_MPILEUP|SAMTOOLS_MPILEUP|ESTIMATE_SEX|ANNOTATE_CNV|EMBED_HAPLOTYPES|DEFINE_BREAKPOINTS|ADD_SVS|ADD_CREST|PSCBS_SEGMENTATION|GENERATE_PLOTS'{
+        cpus            = { check_max( 2 * task.attempt, 'cpus' ) }
+        memory          = { check_max( 6.GB * task.attempt, 'memory' ) }
+   }
+    withName:'MULTIQC'{
+        cpus            = { check_max( 2 * task.attempt, 'cpus' ) }
+        memory          = { check_max( 6.GB * task.attempt, 'memory' ) }
+   } 
+    withName:'BEAGLE5_BEAGLE'{
+        cpus            = { check_max( 4 * task.attempt, 'cpus' ) }
+        memory          = { check_max( 12.GB * task.attempt, 'memory' ) }
+   }
+}