small changes
kubranarci committed Dec 21, 2023
1 parent 84b8baa commit 4a81ce4
Showing 23 changed files with 215 additions and 55 deletions.
3 changes: 1 addition & 2 deletions bin/correctGCBias.R
@@ -574,5 +574,4 @@ png(file=outputfile_rep, width=1000, height=1500, type='cairo')

dev.off()

}

}
3 changes: 2 additions & 1 deletion bin/correctGCBias_functions.R
@@ -160,7 +160,8 @@ checkControl <- function(coverage, covIndex){
#check for second Peak
maxPeak <- which(dens$y==max(dens$y[zeroCrossings]))
secondPeak <- maximaInCross[ which( dens$y[maximaInCross] >= 0.1*dens$y[maxPeak] & dens$y[maximaInCross] != dens$y[maxPeak] )[1] ]


cat(paste0("maxPeak: ", maxPeak))
if ( 0.5*( round(2*dens$x[maxPeak])) != 1 | ( ! is.na( secondPeak ) ) ) {
cat( paste(chr, "warning indicator for contaminated sample or sample swap!\n") )
if( is.na(secondPeak) ){
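For context, the check in this hunk works on a kernel-density estimate of control coverage: local maxima are located via sign changes in the slope of dens$y, the highest one is taken as the main peak, and any other maximum reaching at least 10% of its height is reported as a second peak, the indicator for contamination or a sample swap. A minimal self-contained sketch of that logic in base R, with simulated input (variable names here are illustrative, not the pipeline's):

    # Simulated per-window coverage ratios: a main peak at 1 plus a spurious peak at 0.5
    ratios <- c(rnorm(900, mean = 1.0, sd = 0.05),
                rnorm(100, mean = 0.5, sd = 0.05))
    dens <- density(ratios)

    # Local maxima: positions where the slope of the density changes from + to -
    slope  <- diff(dens$y)
    maxima <- which(diff(sign(slope)) == -2) + 1

    mainPeak <- maxima[which.max(dens$y[maxima])]
    # Any other maximum at >= 10% of the main peak's height counts as a second peak
    others     <- maxima[maxima != mainPeak]
    secondPeak <- others[dens$y[others] >= 0.1 * dens$y[mainPeak]][1]

    if (!is.na(secondPeak)) {
        cat("warning indicator for contaminated sample or sample swap!\n")
    }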
17 changes: 4 additions & 13 deletions bin/estimateHRDScore.sh
@@ -126,10 +126,6 @@ do
exit 2
fi

HRDFile=${pid}_HRDscore_${ploidyFactor}_${tcc}.txt
HRD_DETAILS_FILE=${pid}_HRDscore_contributingSegments_${ploidyFactor}_${tcc}.txt
LST_DETAILS_FILE=${pid}_LSTscore_contributingSegments_${ploidyFactor}_${tcc}.CentromerReduced.txt
MERGED_REDUCED_FILE=${pid}_comb_pro_extra${ploidyFactor}_${tcc}.smoothed.CentromerReduced.txt
echo "before hdr estimation"
HRD_estimation.R \
$combProFileNoArtifacts \
@@ -138,10 +134,10 @@ do
$ploidy \
$tcc \
$pid \
${HRDFile}.tmp \
${HRD_DETAILS_FILE}.tmp \
${LST_DETAILS_FILE}.tmp \
${MERGED_REDUCED_FILE}.tmp \
${pid}_HRDscore_${ploidyFactor}_${tcc}.txt \
${pid}_HRDscore_contributingSegments_${ploidyFactor}_${tcc}.txt \
${pid}_LSTscore_contributingSegments_${ploidyFactor}_${tcc}.CentromerReduced.txt \
${pid}_comb_pro_extra${ploidyFactor}_${tcc}.smoothed.CentromerReduced.txt \
${centromers} \
${cytobandsFile} \
.
@@ -162,11 +158,6 @@ do
exit 2
fi

mv ${HRDFile}.tmp ${HRDFile}
mv ${HRD_DETAILS_FILE}.tmp ${HRD_DETAILS_FILE}
mv ${LST_DETAILS_FILE}.tmp ${LST_DETAILS_FILE}
mv ${MERGED_REDUCED_FILE}.tmp ${MERGED_REDUCED_FILE}
rm ${combProFile}.tmp
done
if [[ "$?" != 0 ]]
then
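For reference, the .tmp indirection removed in this file is the usual write-then-rename guard: output is written under a temporary name and only moved to its final name after the producing command exits cleanly, so an interrupted run cannot leave behind a truncated result that looks complete. A generic sketch of the pattern (hypothetical command and file name):

    #!/usr/bin/env bash
    set -euo pipefail

    out="sample_HRDscore.txt"        # hypothetical final output name

    # Write to a temporary name first ...
    some_tool --output "${out}.tmp"  # hypothetical producing command

    # ... then rename only after the tool succeeded; mv within one
    # filesystem is atomic, so readers see either no file or a complete one.
    mv "${out}.tmp" "${out}"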
2 changes: 2 additions & 0 deletions bin/manual_pruning.R
@@ -759,6 +759,7 @@ test_new$maxStop =NA
swapAlleles <- function(segments, data, chr, blockPre, blockPost){

blockFile <- paste0( blockPre, chr, ".", blockPost)
cat(paste0("blockFile: ",blockFile, "\n\n"))
blocks <- read.table( blockFile, header=F)
colnames(blocks) <- c('chr', 'start', 'end', 'length')

@@ -896,6 +897,7 @@ for (chr in seq_len(maxChr) ) {
if ( chr <= max(chromosomes) ) {
#adjust allele frequencies
dataAll[[chr]]$adjusted <- NA
cat(paste0("chr: ",chr, "\n\n"))
dataAll[[chr]] <- swapAlleles( test_new[selSeg,], dataAll[[chr]], chr, blockPre, blockSuf)

selRem <- which( dataAll[[chr]]$betaN > 0.3 & dataAll[[chr]]$betaN < 0.7 & is.na(dataAll[[chr]]$adjusted) )
Binary file modified bin/python_modules/Options.pyc
Binary file modified bin/python_modules/Tabfile.pyc
Binary file modified bin/python_modules/__init__.pyc
109 changes: 98 additions & 11 deletions conf/test.config
@@ -5,25 +5,112 @@
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/aceseq -profile test,<docker/singularity> --outdir <OUTDIR>
nextflow run main.nf -profile test,singularity --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'
config_profile_contact = 'Kübra Narcı kuebra.narci@dkfz-heidelberg.de'
config_profile_name = 'Test profiler'
config_profile_description = 'Test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
max_cpus = 16
max_memory = '100.GB'
max_time = '8.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
input = 'testdata/samplesheet.csv'

// workflow parameters
outdir = "test_results"
estimatesex = false
createbafplots = false

minHT = 0

// correctGC options
lowess_f = 5
scale_factor = 0
covplot_ylims = 4
gc_bias_json_key = "gc-bias"
minLim = 0.47
maxLim = 0.53
min_length_purity = 1000000
min_hetSNPs_purity = 0 // default 500 !!
dh_stop = "max"
min_length_dh_stop = 1000000
dh_zero = "no"
purity_min = 0.3
purity_max = 1.0
ploidy_min = 1.0
ploidy_max = 6.5
local_minium_upper_boundary_shift = 0.1

// clusterAndPruneSegments
min_seg_length_prune = 100
min_num_SNPs = 1
clustering = "no"
min_cluster_number = 1
min_membership = 0.001
min_distance = 0.0005

legacyMode = false

// Reference //
data_path = "/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes"
fasta = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa'
fasta_fai = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa.fai'
chrom_sizes = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/stats/hs37d5.fa.chrLenOnlyACGT_realChromosomes.tab'
chr_prefix = ""

// Beagle reference
beagle_reference = "${params.data_path}/tools_data/Beagle"
beagle_genetic_map = "${params.data_path}/tools_data/genetic_maps"
beagle_ref_ext = "bref3" // vcf | bref | bref3
beagle_map_ext = "map"

// Annotation files
dbsnp_snv = "${params.data_path}/databases/dbSNP/dbSNP_135/00-All.SNV.vcf.gz"
mapability_file = "${params.data_path}/databases/UCSC/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz"
replication_time_file = "${params.data_path}/databases/ENCODE/ReplicationTime_10cellines_mean_10KB.Rda"
gc_content_file = "${params.data_path}/stats/hg19_GRch37_100genomes_gc_content_10kb.txt"
gene_annotation_file = "${params.data_path}/tools_data/ACEseq/INFORM_druggable_genes.csv"

// get breakpoints/ PSCBS gaps
centromer_file = "${params.data_path}/stats/hg19_gaps.txt"

// HDR estimation
blacklist_file = "assets/artifact.homoDels.potentialArtifacts.txt"
cytobands_file = "assets/hg19_cytoBand.txt"
}

// Perform work directory cleanup when the run has successfully completed
cleanup = true

// Reduce the job submission rate so that the server is not bombarded with jobs

singularity {
enabled = true
cacheDir = "/omics/groups/OE0608/internal/kubran/singularity"
autoMounts = true
runOptions = "-B /omics/groups -B /omics/odcf/analysis -B /omics/odcf/project -B /omics/odcf/reference_data"
}
env {
SINGULARITY_CACHEDIR="/omics/groups/OE0608/internal/kubran/singularity"
SINGULARITY_LIBRARYDIR="/omics/groups/OE0608/internal/kubran/singularity/library"
}

process {
executor = 'lsf'
scratch = '$SCRATCHDIR/$LSB_JOBID'

// Genome references
genome = 'R64-1-1'
}
executor {
name = 'lsf'
perTaskReserve = false
perJobMemLimit = true
submitRateLimit = '30 sec'
queueSize=50
}
96 changes: 85 additions & 11 deletions conf/test_full.config
@@ -1,24 +1,98 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running full-size tests
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a full size pipeline test.
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/aceseq -profile test_full,<docker/singularity> --outdir <OUTDIR>
nextflow run main.nf -profile test_full,singularity --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Full test profile'
config_profile_description = 'Full test dataset to check pipeline function'
config_profile_contact = 'Kübra Narcı kuebra.narci@dkfz-heidelberg.de'
config_profile_name = 'Test profiler'
config_profile_description = 'Test dataset to check pipeline function'

// Input data for full size test
// TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
// Limit resources so that this can run on GitHub Actions
max_cpus = 16
max_memory = '100.GB'
max_time = '8.h'

// Input data
input = 'assets/samplesheet_37_full.csv'

// workflow parameters
outdir = "test_full"

// correctGC options
minLim = 0.47
maxLim = 0.53
min_length_purity = 1000000
min_hetSNPs_purity = 0 // default 500 !!
dh_stop = "max"
min_length_dh_stop = 1000000
dh_zero = "no"
purity_min = 0.3
purity_max = 1.0
ploidy_min = 1.0
ploidy_max = 6.5
local_minium_upper_boundary_shift = 0.1

// Reference //
data_path = "/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes"
fasta = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa'
fasta_fai = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa.fai'
chrom_sizes = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/stats/hs37d5.fa.chrLenOnlyACGT_realChromosomes.tab'
chr_prefix = ""

// Beagle reference
beagle_reference = "${params.data_path}/tools_data/Beagle"
beagle_genetic_map = "${params.data_path}/tools_data/genetic_maps"
beagle_ref_ext = "bref3" // vcf | bref | bref3
beagle_map_ext = "map"

// Annotation files
dbsnp_snv = "${params.data_path}/databases/dbSNP/dbSNP_135/00-All.SNV.vcf.gz"
mapability_file = "${params.data_path}/databases/UCSC/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz"
replication_time_file = "${params.data_path}/databases/ENCODE/ReplicationTime_10cellines_mean_10KB.Rda"
gc_content_file = "${params.data_path}/stats/hg19_GRch37_100genomes_gc_content_10kb.txt"
gene_annotation_file = "${params.data_path}/tools_data/ACEseq/INFORM_druggable_genes.csv"

// get breakpoints/ PSCBS gaps
centromer_file = "${params.data_path}/stats/hg19_gaps.txt"

// HDR estimation
blacklist_file = "assets/artifact.homoDels.potentialArtifacts.txt"
cytobands_file = "assets/hg19_cytoBand.txt"
}

// Perform work directory cleanup when the run has successfully completed
cleanup = true

// Reduce the job submission rate so that the server is not bombarded with jobs

singularity {
enabled = true
cacheDir = "/omics/groups/OE0608/internal/kubran/singularity"
autoMounts = true
runOptions = "-B /omics/groups -B /omics/odcf/analysis -B /omics/odcf/project -B /omics/odcf/reference_data"
}
env {
SINGULARITY_CACHEDIR="/omics/groups/OE0608/internal/kubran/singularity"
SINGULARITY_LIBRARYDIR="/omics/groups/OE0608/internal/kubran/singularity/library"
}

process {
executor = 'lsf'
scratch = '$SCRATCHDIR/$LSB_JOBID'

// Genome references
genome = 'R64-1-1'
}
executor {
name = 'lsf'
perTaskReserve = false
perJobMemLimit = true
submitRateLimit = '30 sec'
queueSize=50
}
2 changes: 1 addition & 1 deletion modules/local/annotate_cnv.nf
@@ -1,6 +1,6 @@
// This needs to run per cnv.tab.gz !
process ANNOTATE_CNV {
tag "$meta.id"
tag "$meta.id chr$intervals"
label 'process_high_cpu_low_memory'

conda (params.enable_conda ? "" : null)
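The retagging here and in the modules below follows one pattern: a tag that interpolates the interval, so per-chromosome tasks are distinguishable in the run log. For the interpolation to resolve, the process must take the interval as an input; a minimal sketch of the shape (hypothetical process and script name, Nextflow DSL2):

    process EXAMPLE_PER_INTERVAL {
        tag "$meta.id chr$intervals"   // shows e.g. "sample1 chr7" in the log

        input:
        tuple val(meta), path(infile)
        each intervals                 // one task per interval, e.g. 1..22

        output:
        tuple val(meta), path("*.out")

        script:
        """
        process_interval.sh ${infile} ${intervals} > ${meta.id}_chr${intervals}.out
        """
    }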
8 changes: 4 additions & 4 deletions modules/local/cluster_segments.nf
@@ -12,10 +12,10 @@ process CLUSTER_SEGMENTS {
val(chr_prefix)

output:
tuple val(meta), path('*normal.txt') , emit: clustered_segments
tuple val(meta), path('*all_seg_2.txt.gz'), path('*all_seg_2.txt.gz.tbi') , emit: snp_update2
tuple val(meta), path('*.pdf')
path "versions.yml" , emit: versions
tuple val(meta), path('*normal.txt') , emit: clustered_segments
tuple val(meta), path('*seg_2.txt.gz'), path('*seg_2.txt.gz.tbi') , emit: snp_update2
tuple val(meta), path('*.pdf') , optional: true
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
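The "optional: true" added to the PDF output is the standard Nextflow way to declare a file a task may legitimately not produce; without it, a run in which the plotting step emits nothing would fail on output binding. The idea in isolation (hypothetical process and script name):

    process PLOT_SEGMENTS {
        input:
        tuple val(meta), path(segments)

        output:
        // Declared but allowed to be absent: if the script decides there is
        // nothing to plot, the task still completes successfully.
        tuple val(meta), path('*.pdf'), optional: true

        script:
        """
        plot_segments.R ${segments}
        """
    }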
2 changes: 1 addition & 1 deletion modules/local/create_fake_samples.nf
@@ -1,5 +1,5 @@
process CREATE_FAKE_SAMPLES {
tag "$meta.id"
tag "$meta.id chr$intervals"
label 'process_low'

conda (params.enable_conda ? "" : null)
2 changes: 1 addition & 1 deletion modules/local/embed_haplotypes.nf
@@ -1,5 +1,5 @@
process EMBED_HAPLOTYPES {
tag "$meta.id"
tag "$meta.id chr$intervals"
label 'process_high_cpu_low_memory'

conda (params.enable_conda ? "" : null)
2 changes: 1 addition & 1 deletion modules/local/fake_control.nf
@@ -1,5 +1,5 @@
process FAKE_CONTROL {
tag "$meta.id"
tag "$meta.id chr$intervals"
label 'process_single'

conda (params.enable_conda ? "" : null)
3 changes: 2 additions & 1 deletion modules/local/gc_bias.nf
@@ -1,10 +1,11 @@
//only works with v0 !
process GC_BIAS {
tag "$meta.id"
label 'process_single'

conda (params.enable_conda ? "" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://kubran/odcf_aceseqcalling:v5':'kubran/odcf_aceseqcalling:v5' }"
'docker://kubran/odcf_aceseqcalling:v0':'kubran/odcf_aceseqcalling:v0' }"

input:
tuple val(meta), path(cnv_pos)
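The container line changed here uses the nf-core ternary convention: a single expression that yields a docker:// URI when the engine is Singularity (and no Docker-image pull is forced), and a bare image name otherwise; this commit pins GC_BIAS back to the v0 image flagged in the comment. The shape of the directive, spelled out:

    // Singularity consumes the docker:// URI natively; Docker takes the bare name.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'docker://kubran/odcf_aceseqcalling:v0' : 'kubran/odcf_aceseqcalling:v0' }"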
4 changes: 2 additions & 2 deletions modules/local/group_haplotypes.nf
@@ -1,6 +1,6 @@
process GROUP_HAPLOTYPES {
tag "$meta.id"
label 'process_single'
tag "$meta.id chr$intervals"
label 'process_low'

conda (params.enable_conda ? "" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
2 changes: 1 addition & 1 deletion modules/local/merge_cnv.nf
@@ -7,7 +7,7 @@ process MERGE_CNV {
'docker://kubran/odcf_aceseqcalling:v5':'kubran/odcf_aceseqcalling:v5' }"

input:
tuple val(meta) , path(cnv)
tuple val(meta) , path(cnv)
val(chr_prefix)

output:
2 changes: 1 addition & 1 deletion modules/local/win_generator.nf
@@ -1,5 +1,5 @@
process WIN_GENERATOR {
tag "$meta.id"
tag "$meta.id chr$intervals"
label 'process_low'

conda (params.enable_conda ? "" : null)
2 changes: 1 addition & 1 deletion modules/nf-core/bcftools/mpileup/main.nf
@@ -1,5 +1,5 @@
process BCFTOOLS_MPILEUP {
tag "$meta.id"
tag "$meta.id chr$intervals"
label 'process_medium'

conda (params.enable_conda ? "bioconda::bcftools=1.9" : null)