Skip to content

Commit

Permalink
14 create test dataset (#23)
Browse files Browse the repository at this point in the history
* Delete seqc2_testdata_aceseq directory

* ACE-seq test dataset

* testdataset additions

* get beagle ref automatically

* annotations testdata

* testdata

* small changes

* beagle ref hg37 added to igenomes

* testdata is running hg19
  • Loading branch information
kubranarci authored Jan 23, 2024
1 parent 4a81ce4 commit f7f8a40
Show file tree
Hide file tree
Showing 65 changed files with 286,560 additions and 269 deletions.
File renamed without changes.
285,606 changes: 285,606 additions & 0 deletions assets/hg19/hg19_GRch37_100genomes_gc_content_10kb.txt

Large diffs are not rendered by default.

File renamed without changes.
459 changes: 459 additions & 0 deletions assets/hg19/hg19_gaps.txt

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
9 changes: 7 additions & 2 deletions bin/estimateHRDScore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ set -euo pipefail
# Copyright (c) 2017 The ACEseq workflow developers.
# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt).

usage() { echo "Usage: $0 [-p pid] [-i jsonfile] [-m legacyMode] [-b blacklistFileName] [-s sexfile] [-c centromers] [-y cytobandsFile]" 1>&2; exit 1; }
usage() { echo "Usage: $0 [-p pid] [-i jsonfile] [-m legacyMode] [-b blacklistFileName] [-s sexfile] [-c centromers] [-y cytobandsFile] [-x chrprefix]" 1>&2; exit 1; }

while [[ $# -gt 0 ]]
do
Expand Down Expand Up @@ -43,6 +43,11 @@ do
shift # past argument
shift # past value
;;
-x)
chrprefix=$2
shift # past argument
shift # past value
;;
esac
done

Expand Down Expand Up @@ -110,7 +115,7 @@ do
fi

#this file could be written out and sorted according to chromosomes
smoothData.py -f $combProFile.tmp -o $combProFile.tmp.tmp && mv $combProFile.tmp.tmp $combProFile.tmp && \
smoothData.py -f $combProFile.tmp -p $chrprefix -o $combProFile.tmp.tmp && mv $combProFile.tmp.tmp $combProFile.tmp && \
removeBreakpoints.py -f $combProFile.tmp -o $combProFile.tmp.tmp && mv $combProFile.tmp.tmp $combProFile.tmp
if [[ "$?" != 0 ]]
then
Expand Down
52 changes: 52 additions & 0 deletions bin/prepare_beagle_ref.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
set -euo pipefail
# Copyright (c) 2017 The ACEseq workflow developers.
# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt).

usage() { echo "Usage: $0 [-r ref -m memory]" 1>&2; exit 1; }

while [[ $# -gt 0 ]]
do
key=$1
case $key in
-r)
ref=$2
shift # past argument
shift # past value
;;
-m)
memory=$2
shift # past argument
shift # past value
;;
esac
done

if [[ ${ref} == 'hg37' ]]; then
for i in {1..22}; do
echo $i
echo https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chr$i.1kg.phase3.v5a.b37.bref3
wget https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chr$i.1kg.phase3.v5a.b37.bref3
done

wget https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chrX.1kg.phase3.v5a.b37.bref3

wget https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh37.map.zip

unzip plink.GRCh37.map.zip

else

for i in {1..22}; do
echo $i
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
(zcat ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | head -n 300 | grep "#" ; zcat ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | grep -v "#" | sed 's/^/chr/') | java -Xmx${memory}M -jar beagle.jar > ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref3
done

wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz

wget https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip

unzip plink.GRCh38.map.zip

fi

Binary file modified bin/python_modules/Options.pyc
Binary file not shown.
Binary file modified bin/python_modules/Tabfile.pyc
Binary file not shown.
Binary file modified bin/python_modules/__init__.pyc
Binary file not shown.
18 changes: 12 additions & 6 deletions bin/smoothData.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
parser.add_argument( '--file', '-f', type=file, help="segment file with copy number information" )
parser.add_argument( '--out', '-o', default=sys.stdout, type=str, help='outputfile' )
parser.add_argument( '--maxDistToNext', '-m', default=1000, type=int, help='maximum allowed distance between segments to be merged' )
parser.add_argument( '--chr_prefix', '-p', default='chr', type=str, help='' )
# maxDistToNext is used only used for first segments [in chromosome | after large gap]


Expand All @@ -26,6 +27,7 @@

args = parser.parse_args()
maxDistToNext=args.maxDistToNext
chr_prefix = args.chr_prefix
out = args.out
cutoff = 0.5
maxLen = 3000000
Expand Down Expand Up @@ -159,8 +161,6 @@ def merge_first_segment(prior_line, newline):
prior_line["length"] = int(int(prior_line["end"]) - int(prior_line["start"]) +1 )
return(prior_line)



if __name__ == "__main__":
#read first two lines of file before looping over all lines
prior_line = infile.readline()
Expand Down Expand Up @@ -234,7 +234,13 @@ def merge_first_segment(prior_line, newline):
prior_line = merge_gap(prior_line, newline)
newline=None

#print last line(s)
out.write( "\t".join( str(prior_line[key]) for key in infile.header ) + "\n" )
if newline:
out.write( "\t".join( str(newline[key]) for key in infile.header ) + "\n" )
if (chr_prefix == "chr"):
#print last line(s)
out.write( "\t".join( str(prior_line[key]) for key in infile.header ) + "\n" )
if newline:
out.write( "\t".join( str(newline[key]) for key in infile.header ) + "\n" )
else:
#print last line(s)
out.write( "\t".join( str(prior_line[0][key]) for key in infile.header ) + "\n" )
if newline:
out.write( "\t".join( str(newline[0][key]) for key in infile.header ) + "\n" )
14 changes: 7 additions & 7 deletions conf/dkfz_cluster_hg37.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ params {
max_time = '48.h'

// Input data
input = 'seqc2_testdata_chr21_hg37/samplesheet_37.csv'
input = 'testdata/samplesheet.csv'

// workflow parameters
outdir = "results37"
Expand All @@ -47,10 +47,10 @@ params {
chr_prefix = ""

// Beagle reference
beagle_reference = "${params.data_path}/tools_data/Beagle"
beagle_genetic_map = "${params.data_path}/tools_data/genetic_maps"
beagle_ref_ext = "bref3" // vcf | bref | bref3
beagle_map_ext = "map"
beagle_ref = "${params.data_path}/tools_data/Beagle"
plink_map = "${params.data_path}/tools_data/genetic_maps"
//beagle_ref_ext = "bref3" // vcf | bref | bref3
//beagle_map_ext = "map"

// Annotation files
dbsnp_snv = "${params.data_path}/databases/dbSNP/dbSNP_135/00-All.SNV.vcf.gz"
Expand All @@ -63,8 +63,8 @@ params {
centromer_file = "${params.data_path}/stats/hg19_gaps.txt"

// HDR estimation
blacklist_file = "assets/artifact.homoDels.potentialArtifacts.txt"
cytobands_file = "assets/hg19_cytoBand.txt"
blacklist_file = "assets/hg19/artifact.homoDels.potentialArtifacts.txt"
cytobands_file = "assets/hg19/hg19_cytoBand.txt"
}

// Perform work directory cleanup when the run has succesfully completed
Expand Down
6 changes: 3 additions & 3 deletions conf/dkfz_cluster_hg38.config
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ params {
fasta_fai = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38/sequence/GRCh38_decoy_ebv_alt_hla_phiX.fa.fai'
chrom_sizes = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv'
chr_prefix = "chr"
//contig_file = "assets/GRCh38_decoy_ebv_phiX_alt_hla_chr.contig.bed"
//contig_file = "assets/hg38/GRCh38_decoy_ebv_phiX_alt_hla_chr.contig.bed"

// Beagle reference
beagle_reference = "${params.data_path}/tools_data/Beagle/1000genomes"
Expand All @@ -51,8 +51,8 @@ params {
centromer_file = "${params.data_path}/stats/gap_with_centromeres.header.txt"

// HDR estimation
blacklist_file = "assets/artifact.homoDels.potentialArtifacts.hg38.txt"
cytobands_file = "assets/hg38_cytoBand.txt"
blacklist_file = "assets/hg38/artifact.homoDels.potentialArtifacts.hg38.txt"
cytobands_file = "assets/hg38/hg38_cytoBand.txt"
}

// Perform work directory cleanup when the run has succesfully completed
Expand Down
4 changes: 3 additions & 1 deletion conf/igenomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ params {
known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz.tbi"
known_indels_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.indels.b37.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.b37.vcf.gz'
mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem"
chr_prefix = ""
chr_prefix = ""
beagle_ref = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/beagle"
plink_map = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/plink"
}
'GRCh38' {
chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes"
Expand Down
11 changes: 9 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ process {
mode: params.publish_dir_mode
]
}
withName: BEAGLE5_BEAGLE {
ext.args = "impute=false"
publishDir = [
path: {"${params.outdir}/${meta.id}"},
pattern: "{noting}",
mode: params.publish_dir_mode
]
}
withName: 'ADD_HAPLOTYPES' {
ext.args = ""
publishDir = [
Expand All @@ -52,7 +60,6 @@ process {
}

withName: 'ANNOTATE_CNV' {
ext.args = ""
publishDir = [
path: {"${params.outdir}/${meta.id}/anno_cnv"},
pattern: "*{.tab.gz,.tab.gz.tbi}",
Expand Down Expand Up @@ -169,7 +176,7 @@ process {
// Don't publish results for these processes
//
process {
withName: 'GREP_GENOTYPES|GETCHROMSIZES|EMBED_HAPLOTYPES|BEAGLE5_BEAGLE|CREATE_UNPHASED|FAKE_CONTROL|GET_GENOTYPES|MAKE_MOCK|WIN_GENERATOR|CREATE_FAKE_SAMPLES' {
withName: 'GREP_GENOTYPES|GETCHROMSIZES|EMBED_HAPLOTYPES|BEAGLE5_BEAGLE|CREATE_UNPHASED|FAKE_CONTROL|GET_GENOTYPES|MAKE_MOCK|WIN_GENERATOR|CREATE_FAKE_SAMPLES|PREPARE_BEAGLE_REF' {
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
Expand Down
101 changes: 49 additions & 52 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run main.nf -profile test,singularity --outdir <OUTDIR>
nextflow run main.nf -profile test,singularity
nextflow run main.nf -profile test,docker

----------------------------------------------------------------------------------------
*/
Expand All @@ -21,20 +22,17 @@ params {
max_time = '8.h'

// Input data
input = 'testdata/samplesheet.csv'
input = "${projectDir}/testdata/samplesheet.csv"
outdir = "${projectDir}/results"

// workflow parameters
outdir = "test_results"

estimatesex = false
createbafplots = false

minHT = 0

// correctGC options
lowess_f = 5
scale_factor = 0
covplot_ylims = 4
gc_bias_json_key = "gc-bias"
minLim = 0.47
maxLim = 0.53
min_length_purity = 1000000
Expand All @@ -59,58 +57,57 @@ params {
legacyMode = false

// Reference //
data_path = "/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes"
fasta = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa'
fasta_fai = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/sequence/1KGRef/hs37d5.fa.fai'
chrom_sizes = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/stats/hs37d5.fa.chrLenOnlyACGT_realChromosomes.tab'
chr_prefix = ""

// Beagle reference
beagle_reference = "${params.data_path}/tools_data/Beagle"
beagle_genetic_map = "${params.data_path}/tools_data/genetic_maps"
beagle_ref_ext = "bref3" // vcf | bref | bref3
beagle_map_ext = "map"
genome = "GRCh37"

// Annotation files
dbsnp_snv = "${params.data_path}/databases/dbSNP/dbSNP_135/00-All.SNV.vcf.gz"
mapability_file = "${params.data_path}/databases/UCSC/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz"
replication_time_file = "${params.data_path}/databases/ENCODE/ReplicationTime_10cellines_mean_10KB.Rda"
gc_content_file = "${params.data_path}/stats/hg19_GRch37_100genomes_gc_content_10kb.txt"
gene_annotation_file = "${params.data_path}/tools_data/ACEseq/INFORM_druggable_genes.csv"
dbsnp_snv = "${projectDir}/testdata/annotations/dbsnp_snv.test.vcf.gz"
mapability_file = "${projectDir}/testdata/annotations/mappability5.test.bed.gz"
replication_time_file = "${projectDir}/testdata/annotations/ReplicationTime_10cellines_mean_10KB.Rda"
gc_content_file = "assets/hg19/hg19_GRch37_100genomes_gc_content_10kb.txt"
gene_annotation_file = "${projectDir}/testdata/annotations/druggable_genes.csv"

// get breakpoints/ PSCBS gaps
centromer_file = "${params.data_path}/stats/hg19_gaps.txt"
centromer_file = "assets/hg19/hg19_gaps.txt"

// HDR estimation
blacklist_file = "assets/artifact.homoDels.potentialArtifacts.txt"
cytobands_file = "assets/hg19_cytoBand.txt"
blacklist_file = "assets/hg19/artifact.homoDels.potentialArtifacts.txt"
cytobands_file = "assets/hg19/hg19_cytoBand.txt"
}

// Perform work directory cleanup when the run has succesfully completed
cleanup = true

// Reduce the job submit rate to about 5 per second, this way the server won't be bombarded with jobs

singularity {
enabled = true
cacheDir = "/omics/groups/OE0608/internal/kubran/singularity"
autoMounts = true
runOptions = "-B /omics/groups -B /omics/odcf/analysis -B /omics/odcf/project -B /omics/odcf/reference_data"
}
env {
SINGULARITY_CACHEDIR="/omics/groups/OE0608/internal/kubran/singularity"
SINGULARITY_LIBRARYDIR="/omics/groups/OE0608/internal/kubran/singularity/library"
// Enable container engines/virtualisation envs for CI testing
// only works when specified with the profile ENV
// otherwise tests can be done with the regular provided profiles
if (System.getenv('PROFILE')) {
if ("$PROFILE" == "docker") {
conda.enabled = false
docker.enabled = true
docker.userEmulation = { params.use_gatk_spark ? false : true }.call()
charliecloud.enabled = false
podman.enabled = false
shifter.enabled = false
singularity.enabled = false
} else if ("$PROFILE" == "singularity") {
conda.enabled = false
params.singularity_pull_docker_container = false
singularity.autoMounts = true
singularity.enabled = true
charliecloud.enabled = false
docker.enabled = false
podman.enabled = false
shifter.enabled = false
}
}

process {
executor = 'lsf'
scratch = '$SCRATCHDIR/$LSB_JOBID'

}
executor {
name = 'lsf'
perTaskReserve = false
perJobMemLimit = true
submitRateLimit = '30 sec'
queueSize=50
}
withName:'BCFTOOLS_MPILEUP|SAMTOOLS_MPILEUP|ESTIMATE_SEX|ANNOTATE_CNV|EMBED_HAPLOTYPES|DEFINE_BREAKPOINTS|ADD_SVS|ADD_CREST|PSCBS_SEGMENTATION|GENERATE_PLOTS'{
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
}
withName:'MULTIQC'{
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
}
withName:'BEAGLE5_BEAGLE'{
cpus = { check_max( 4 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
}
}
Loading

0 comments on commit f7f8a40

Please sign in to comment.