Skip to content

Commit

Permalink
Merge pull request #75 from tgen/develop
Browse files Browse the repository at this point in the history
  • Loading branch information
bryce-turner authored Mar 27, 2023
2 parents 7c7a990 + 05888e9 commit 45e48f6
Show file tree
Hide file tree
Showing 35 changed files with 231 additions and 134 deletions.
Empty file modified bastien/create_canfam3toRosCfam_liftover_chain.sh
100644 → 100755
Empty file.
Empty file modified bastien/create_genderCheck_SNP_list.sh
100644 → 100755
Empty file.
Empty file modified bastien/create_samtools_stats_non_N_region_file.sh
100644 → 100755
Empty file.
Empty file modified bastien/create_snp_database.sh
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion coyote/capture_kits.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
SC2,Agilent_SureSelect_Canine_Exon_V2,agilent_canine_exonV2_targets_sorted.bed,agilent_canine_exonV2_targets_sorted.bed
XT2,Agilent_SureSelect_XT2_Vidium_v1.0,Vidium_v1.0_exome_Covered_nochr_noheader.bed,Vidium_v1.0_exome_Covered_nochr_noheader.bed
XT2,Agilent_SureSelect_XT2_Vidium_v1.0,Vidium_v1.0_exome_Covered_nochr_noheader.bed,Vidium_v1.0_exome_Covered_nochr_noheader.bed
ACE,Twist_Alliance_Canine_Exome,Twist_Alliance_Canine_Exome_canFam3_all_targets_covered_nochr_subMT_chrUnMod.bed,Twist_Alliance_Canine_Exome_canFam3_all_targets_covered_nochr_subMT_chrUnMod.bed
Empty file modified coyote/create_samtools_stats_non_N_region_file.sh
100644 → 100755
Empty file.
Empty file modified coyote/create_snp_database.sh
100644 → 100755
Empty file.
Empty file modified monte/create_canfam3tocanfam6_liftover_chain.sh
100644 → 100755
Empty file.
Empty file modified monte/create_genderCheck_SNP_list.sh
100644 → 100755
Empty file.
Empty file modified monte/create_samtools_stats_non_N_region_file.sh
100644 → 100755
Empty file.
Empty file modified monte/create_snp_database.sh
100644 → 100755
Empty file.
4 changes: 3 additions & 1 deletion phoenix/capture_kits.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ CR2,Agilent_SureSelect_CREv2,Agilent_SureSelect_CREv2_cliRes_hs37d5_all_hg38.bed
E61,E61_TruSeqExome_aka_NexteraExpandedExome,nexterarapidcapture_expandedexome_targetedregions_b38.bed,nexterarapidcapture_expandedexome_targetedregions_b38.bed
IE2,IDT_xGen_Exome_Research_Panel_v2,xgen-exome-research-panel-v2-targets-hg38.bed,xgen-exome-research-panel-v2-probes-hg38.bed
KHE,KAPA_HyperExome,KAPA_HyperExome_hg38_capture_targets.bed,KAPA_HyperExome_hg38_capture_targets.bed
NE3,SeqCap_EZ_Exome_v3,SeqCap_EZ_Exome_v3_hg38_primary_targets.bed,SeqCap_EZ_Exome_v3_hg38_primary_targets.bed
S4U,Agilent_SureSelect_DNA_SureSelectXT_Human_All_Exon_V4_UTR,S03723424_S4U_hg19_to_hg38_liftover_results.bed,S03723424_S4U_hg19_to_hg38_liftover_results.bed
S5U,Agilent_SureSelect_DNA_SureSelectXT_Human_All_Exon_V5_UTRs,S04380219_Regions.bed,S04380219_Regions.bed
S5X,Agilent_SureSelect_DNA_SureSelectXT_Human_All_Exon_V5,S04380110_Regions.bed,S04380110_Regions.bed
Expand All @@ -19,9 +20,10 @@ STX,Agilent_SureSelect_DNA_SureSelect_V5_Strexome,STX_Agilent_SureSelect_DNA_Sur
TB1,Twist_Custom_Barthel_1,clean_targets.bed,clean_targets.bed
TC1,Twist_Custom_CHIP_Panel1,TC1_TE-99640003_merged_targets.bed,TC1_TE-99640003_merged_probes.bed
TCE,Twist_Comprehensive_Exome,Twist_ComprehensiveExome_targets_hg38.bed,Twist_ComprehensiveExome_targets_hg38.bed
TOE,Twist_Core_Exome,Twist_Core_Exome_targets_hg38.bed,Twist_Core_Exome_targets_hg38.bed
TE2,Twist_Exome2.0,hg38_Twist_exome_2_1_annotated_targets.bed,hg38_Twist_exome_2_1_annotated_targets.bed
TK1,Twist_Custom_Keats_1_MM_R_ISS,concat_1X_4X_hg38_revised_cleaned_merged_d120_padded60bp.bed,concat_1X_4X_hg38_revised_cleaned_merged_d120_padded60bp.bed
TK2,Twist_Custom_Keats_2_MM_R_ISS_v2,TK2_TE-92573302_merged_targets.bed,TK2_TE-92573302_merged_probes.bed
TOE,Twist_Core_Exome,Twist_Core_Exome_targets_hg38.bed,Twist_Core_Exome_targets_hg38.bed
TS1,TS1_Illumina_TruSeqDNA_Exome_v1.2,truseq-dna-exome-target-regions-hg19-liftover2-hg38.bed,Exome-Probes-Manifest-v1-2_hg19-liftover2-hg38.bed
TSC,Agilent_Custom_SureSelect_DNA_SureSelect_Human_All_Exon_V6,agilent_custom_sureselect_all_exon_v6_tsc_grch38.bed,agilent_custom_sureselect_all_exon_v6_tsc_grch38.bed
UTJ,TCE_CustomCaptureTwist_UTJMM,TCE_CCT_UTJMM_SMM_validation_v1.bed,TCE_CCT_UTJMM_SMM_validation_v1.bed
Expand Down
Empty file modified phoenix/create_disease_specific_resources.sh
100644 → 100755
Empty file.
30 changes: 22 additions & 8 deletions reporting_tools/collect_study_timing.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,28 @@

# usage: collect_study_timing.sh <List_of_Projects> <Study>

# Requires: jetstream in the user path?
# Requires: jetstream installed to whichever version of python is used
# Potential enhancements:
# - singularity would benefit from dynamic arguments, especially for binding

set -e

trap "echo jetstream not found" ERR

jetstreams_python=$(head -n1 $(which jetstream) | grep -o "/.*")

# Unsetting the trap since it is no longer valid
trap - ERR

# Load needed modules
#module load R/3.4.4 (using native R on merck as there is an X11/cairo error with this version), and merckx doesn't see the phoenix version with fix
module load python/3.6.0
module load singularity/3.7.1-phoenix

# make folder for summary
mkdir -p timing_summary

# enter summary folder and create initial study template files
cd timing_summary
echo -e Project"\t"Group"\t"Tasks"\t"Total_CPU_Hours"\t"Max_Task_CPU_Hours"\t"Total_Elapsed_Hours"\t"Max_Task_Elapsed_Hours"\t"PCT_CPU_Hours"\t"PCT_Elapsed_Hours > study_task_summary.txt
echo -e Project"\t"Tags"\t"Tasks"\t"Total_CPU_Hours"\t"Max_Task_CPU_Hours"\t"Total_Elapsed_Hours"\t"Max_Task_Elapsed_Hours"\t"PCT_CPU_Hours"\t"PCT_Elapsed_Hours > study_task_summary.txt
echo -e Project"\t"Tasks"\t"Total_CPU_Hours"\t"Total_Elapsed_Hours > study_project_summary.txt

echo
Expand All @@ -28,11 +38,14 @@ do
cd ../${project}

# Generate timing result
python3 /home/tgenjetstream/git_repositories/jetstream_resources/reporting_tools/report_cpu_usage.py > ../timing_summary/${project}_timing.txt
$jetstreams_python /home/tgenjetstream/git_repositories/jetstream_resources/reporting_tools/report_cpu_usage.py > ../timing_summary/${project}_timing.txt

# return to project summary to run R scripts
cd ../timing_summary
Rscript --vanilla \
# Singularity command needs to be more dynamic
# Example for gemini PMED - singularity exec -e -B /coh_labs/PMED docker://ghcr.io/tgen/jetstream_containers/r-with_modules:3.6.1 Rscript --vanilla
# This does ping ghcr.io, so a local image might be preferred.
singularity exec /home/tgenref/containers/r-with_modules_3.6.1.sif Rscript --vanilla \
/home/tgenjetstream/git_repositories/jetstream_resources/reporting_tools/summarize_project_runtime.R \
--project ${project} \
--time_summary ${project}_timing.txt \
Expand All @@ -46,9 +59,10 @@ echo

# Summarize the overall study results
echo "##################################################"
echo "Summarizing overal study timing results"
echo "Summarizing overall study timing results"

Rscript --vanilla \
# See comment above for singularity usage caveat
singularity exec /home/tgenref/containers/r-with_modules_3.6.1.sif Rscript --vanilla \
/home/tgenjetstream/git_repositories/jetstream_resources/reporting_tools/study_summary_Graphs.R \
--study_name $2

39 changes: 26 additions & 13 deletions reporting_tools/report_cpu_usage.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#!/usr/bin/env python3

###
# Copied from Ryan Richholt Toolkit
###

# Usage: report_cpu_usage.py <Path_to_Jetstream_Project>
# Usage: report_cpu_usage.py --project <Path_to_Jetstream_Project> --ignore_tags <tag_to_ignore> <another_tag_to_ignore>
# Arguments are optional

# Generates a report of CPU Usage from a Jetstream project
# Future improvements : leverage tags for subsummary by subtype, RG_SM, Individual Tools

# Author: Ryan Richolt
# Updated by: Bryce Turner - bturner@tgen.org

import argparse
import logging
Expand All @@ -23,7 +22,8 @@

def arg_parser():
parser = argparse.ArgumentParser(description='Report CPU hours for a project')
parser.add_argument('project', nargs='*')
parser.add_argument('--project', nargs='*')
parser.add_argument('--ignore_tags', nargs='+')
return parser


Expand All @@ -40,30 +40,43 @@ def summarize_task_cpuhs(t):
return cpus, fElapsed, cpuh


def report(project):
def report(project, ignore_tags):
total_cpu_hours = 0

print('CPUs\tElapsed\tHours\tCumulative\tTask')
print('CPUs\tElapsed\tHours\tCumulative\tTask\tTags')
workflow = project.load_workflow()
log.critical(f'Reporting on: {workflow}')
for name, t in project.load_workflow().tasks.items():
for name, t in workflow.tasks.items():
cpus, task_elapsed, task_cpu_hours = summarize_task_cpuhs(t)
total_cpu_hours += task_cpu_hours
print(f'{cpus}\t{task_elapsed}\t{round(task_cpu_hours, 2)}\t{round(total_cpu_hours, 2)}\t{t}')
tags = t.directives.get('tags', ['Untagged'])
ignore_tags_re_string = '|'.join(f'({tag})' for tag in ignore_tags).replace("-", "_")
cleanre = re.compile(ignore_tags_re_string)
    # We perform a regex search because we expect the ignore tags to be partial matches:
    # for example, we may want to ignore 'stats' tags, but 'stats' may not be an actual tag —
    # it could instead be part of a tag, e.g. stats2json or stats2lims
clean_tags = [tag for tag in tags if not re.search(cleanre, tag.replace("-", "_")) ]
tags_string = '_'.join(clean_tags).replace(" ", "_")
print(f'{cpus}\t{task_elapsed}\t{round(task_cpu_hours, 2)}\t{round(total_cpu_hours, 2)}\t{name}\t{tags_string}')


def main():
parser = arg_parser()
args = parser.parse_args()

if args.ignore_tags:
ignore_tags = args.ignore_tags
else:
ignore_tags = []

if args.project:
for p in args.project:
try:
report(jetstream.Project(path=p))
report(jetstream.Project(path=p), ignore_tags)
except Exception as e:
log.exception(f'Failed to generate report for "{p}"')
else:
report(jetstream.Project())
report(jetstream.Project(), ignore_tags)


if __name__ == '__main__':
Expand Down
16 changes: 8 additions & 8 deletions reporting_tools/study_summary_Graphs.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,23 @@ if (is.null(opt$study_name)){
tasks <- read_tsv("study_task_summary.txt")
projects <- read_tsv("study_project_summary.txt")

ggplot(tasks, aes(x=Group, y=Total_CPU_Hours)) +
geom_jitter() +
ggplot(tasks, aes(x=Tags, y=Total_CPU_Hours)) +
geom_jitter(size=0.5) +
coord_flip()
ggsave(file=paste(opt$study_name, "_TotalCPUhours_by_Group_per_Project.png", sep=""), dpi=150)

ggplot(tasks, aes(x=Group, y=Total_Elapsed_Hours)) +
geom_jitter() +
ggplot(tasks, aes(x=Tags, y=Total_Elapsed_Hours)) +
geom_jitter(size=0.5) +
coord_flip()
ggsave(file=paste(opt$study_name, "_TotalElapsedHours_by_Group_per_Project.png", sep=""), dpi=150)

ggplot(tasks, aes(x=Group, y=Max_Task_Elapsed_Hours)) +
geom_jitter() +
ggplot(tasks, aes(x=Tags, y=Max_Task_Elapsed_Hours)) +
geom_jitter(size=0.5) +
coord_flip()
ggsave(file=paste(opt$study_name, "_MaxTaskElapsedHours_by_Group_per_Project.png", sep=""), dpi=150)

ggplot(tasks, aes(x=Group, y=Max_Task_CPU_Hours)) +
geom_jitter() +
ggplot(tasks, aes(x=Tags, y=Max_Task_CPU_Hours)) +
geom_jitter(size=0.5) +
coord_flip()
ggsave(file=paste(opt$study_name, "_MaxTaskCPUhours_by_Group_per_Project.png", sep=""), dpi=150)

Expand Down
95 changes: 9 additions & 86 deletions reporting_tools/summarize_project_runtime.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,103 +63,26 @@ project_name <- opt$project

## Update the tibble table

# Remove "<Task(complete): " and ">" from the TASK column
data <- data %>% mutate_if(is.character, str_replace_all, pattern = '<', replacement = "")
data <- data %>% mutate_if(is.character, str_replace_all, pattern = "[(]", replacement = "")
data <- data %>% mutate_if(is.character, str_replace_all, pattern = '[)]', replacement = "")
data <- data %>% mutate_if(is.character, str_replace_all, pattern = 'Taskcomplete: ', replacement = "")
data <- data %>% mutate_if(is.character, str_replace_all, pattern = '[>]', replacement = "")

# Parse the Elapsed time into hours, minutes, seconds
data <- data %>% separate(Elapsed, into = c("hours", "minutes", "seconds"), sep = ":", convert = TRUE, remove = FALSE)

# Add Summary Columns
data <- data %>% mutate(Group = case_when(str_detect(Task, "^copy_fastqs") ~ "Copy_Fastq",
str_detect(Task, "^split_fastq") ~ "Split_Fastq",
str_detect(Task, "^chunked_bwa_mem_samtools_fixmate") ~ "BWA_Align",
str_detect(Task, "^chunked_bwa_mem2_samtools_fixmate") ~ "BWA2_Align",
str_detect(Task, "^chunked_samtools_merge_rg_bams") ~ "Samtools_Merge",
str_detect(Task, "^samtools_markdup") ~ "Samtools_MarkDup",
str_detect(Task, "^bam_to_cram") ~ "Samtools_BamCram",
str_detect(Task, "^gatk_collectwgsmetrics") ~ "Picard_Metric",
str_detect(Task, "^gatk_collectwgsmetricswithnonzerocoverage") ~ "Picard_Metric",
str_detect(Task, "^gatk_collectrawwgsmetrics") ~ "Picard_Metric",
str_detect(Task, "^gatk_collectmultiplemetrics") ~ "Picard_Metric",
str_detect(Task, "^gatk_convertsequencingarrtifacttooxog") ~ "Picard_Metric",
str_detect(Task, "^gatk_collecthsmetrics") ~ "Picard_Metric",
str_detect(Task, "^gatk_collectrnaseqmetrics") ~ "Picard_Metric",
str_detect(Task, "^samtools_stats") ~ "Samtools_Metric",
str_detect(Task, "^samtools_flagstat") ~ "Samtools_Metric",
str_detect(Task, "^samtools_idxstats") ~ "Samtools_Metric",
str_detect(Task, "^verifybamid2") ~ "Random_Stat",
str_detect(Task, "^freebayes_sex_check") ~ "Random_Stat",
str_detect(Task, "^snpsniffer_geno") ~ "Random_Stat",
str_detect(Task, "^hmmcopy_make_wig_bwa") ~ "iChor_CNA",
str_detect(Task, "^ichor_cna_bwa") ~ "iChor_CNA",
str_detect(Task, "^haplotypecaller_gvcf") ~ "HaplotypeCaller",
str_detect(Task, "^haplotypecaller_gvcf_merge") ~ "HaplotypeCaller",
str_detect(Task, "^manta") ~ "Manta_Strelka",
str_detect(Task, "^strelka2_filter_variants") ~ "Variant_Filter",
str_detect(Task, "^strelka2") ~ "Manta_Strelka",
str_detect(Task, "^deepvariant_make_examples") ~ "Deepvariant",
str_detect(Task, "^deepvariant_call_variants") ~ "Deepvariant",
str_detect(Task, "^deepvariant_postprocess_variants") ~ "Deepvariant",
str_detect(Task, "^deepvariant_filter_variants") ~ "Deepvariant",
str_detect(Task, "^lancet_merge_chunks") ~ "Variant_Merge",
str_detect(Task, "^lancet_filter_variants") ~ "Variant_Filter",
str_detect(Task, "^lancet") ~ "Lancet",
str_detect(Task, "^octopus_merge_chunks") ~ "Variant_Merge",
str_detect(Task, "^octopus_filter_variants") ~ "Variant_Filter",
str_detect(Task, "^octopus") ~ "Octopus",
str_detect(Task, "^vardict_merge_chunks") ~ "Variant_Merge",
str_detect(Task, "^vardict_filter_variants") ~ "Variant_Filter",
str_detect(Task, "^vardict") ~ "VarDictJava",
str_detect(Task, "^mutect2_merge_chunks") ~ "Variant_Merge",
str_detect(Task, "^mutect2_filter_variants") ~ "Variant_Filter",
str_detect(Task, "^mutect2_filter_calls") ~ "Mutect",
str_detect(Task, "^mutect2_calculate_contamination") ~ "Mutect",
str_detect(Task, "^mutect2_merge_pileup_summaries") ~ "Mutect",
str_detect(Task, "^mutect2_learn_readorientationmodel") ~ "Mutect",
str_detect(Task, "^mutect2_merge_stats") ~ "Mutect",
str_detect(Task, "^mutect2_merge_pileup_summaries") ~ "Mutect",
str_detect(Task, "^mutect2_GetPileupSummaries") ~ "Mutect",
str_detect(Task, "^mutect2") ~ "Mutect",
str_detect(Task, "^vcfmerger2") ~ "VCFmerger",
str_detect(Task, "^bcftools_annotate") ~ "Annotation",
str_detect(Task, "^snpeff") ~ "Annotation",
str_detect(Task, "^vep") ~ "Annotation",
str_detect(Task, "^bcftools_annotate") ~ "Annotation",
str_detect(Task, "^bcftools_annotate") ~ "Annotation",
str_detect(Task, "^delly") ~ "Delly",
str_detect(Task, "^gatk_call_cnv") ~ "GATK_CNV",
str_detect(Task, "^add_matched_rna") ~ "RNA_Steps",
str_detect(Task, "^add_rna_header_to_vcf") ~ "RNA_Steps",
str_detect(Task, "^salmon_quant_cdna") ~ "RNA_Steps",
str_detect(Task, "^star_quant") ~ "RNA_Steps",
str_detect(Task, "^star_fusion") ~ "RNA_Steps",
str_detect(Task, "^fixmate_sort_star") ~ "RNA_Steps",
str_detect(Task, "^markduplicates_star_gatk") ~ "RNA_Steps",
str_detect(Task, "^rna_getBTcellLociCounts") ~ "RNA_Steps",
TRUE ~ "Misc"
)
)

data <- data %>% select("Tags", everything())

# Plot
ggplot(data, aes(x=Group, y=Elapsed, color=as.factor(CPUs))) +
geom_jitter() +
ggplot(data, aes(x=Tags, y=Elapsed, color=as.factor(CPUs))) +
geom_jitter(size=0.5) +
scale_color_discrete() +
coord_flip()
ggsave(file=paste(project_name, "_ElapsedTime_by_Task_per_Group.png", sep=""), dpi=150)

ggplot(data, aes(x=Group, y=Hours)) +
geom_jitter() +
ggplot(data, aes(x=Tags, y=Hours)) +
geom_jitter(size=0.5) +
coord_flip()
ggsave(file=paste(project_name, "_CPUhours_by_Task_per_Group.png", sep=""), dpi=150)

# Group and summarize to get realtime and CPU hours by task Group
task_summary <- data %>%
group_by(Group) %>%
group_by(Tags) %>%
summarise(Tasks = n(),
Total_CPU_Hours = sum(Hours),
Max_Task_CPU_Hours = max(Hours),
Expand All @@ -171,15 +94,15 @@ task_summary <- data %>%

# Add column with project
task_summary <- task_summary %>%
add_column(Project = project_name, .before = "Group")
add_column(Project = project_name, .before = "Tags")

# Plot Summary Data
ggplot(task_summary, aes(x=Group, y=Total_Elapsed_Hours)) +
ggplot(task_summary, aes(x=Tags, y=Total_Elapsed_Hours)) +
geom_bar(stat="identity") +
coord_flip()
ggsave(file=paste(project_name, "_ElapsedHours_by_TaskGroup.png", sep=""), dpi=150)

ggplot(task_summary, aes(x=Group, y=Total_CPU_Hours)) +
ggplot(task_summary, aes(x=Tags, y=Total_CPU_Hours)) +
geom_bar(stat="identity") +
coord_flip()
ggsave(file=paste(project_name, "_CPUhours_by_TaskGroup.png", sep=""), dpi=150)
Expand Down
Loading

0 comments on commit 45e48f6

Please sign in to comment.