From 824081711689c09cd3acc85bc4a77eac90448328 Mon Sep 17 00:00:00 2001 From: Matt Welland Date: Mon, 28 Oct 2024 22:42:53 +1100 Subject: [PATCH] Fix broken gCNV Annotation pipeline (#960) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add the get-gencode script * use the pre-downloaded gencode data in gCNV and SV * Bump version: 1.29.4 → 1.29.5 --- .bumpversion.cfg | 2 +- .github/workflows/docker.yaml | 2 +- configs/defaults/gatk_sv_multisample.toml | 3 + configs/defaults/gcnv.toml | 3 + cpg_workflows/jobs/gcnv.py | 21 ++-- cpg_workflows/jobs/seqr_loader_sv.py | 3 + cpg_workflows/query_modules/seqr_loader_sv.py | 77 +++++------- cpg_workflows/scripts/get_gencode_gtf.sh | 15 +++ .../seqr_loader_cnv.py | 115 ++++++++++++------ cpg_workflows/stages/gcnv.py | 25 ++-- setup.py | 3 +- 11 files changed, 154 insertions(+), 115 deletions(-) create mode 100644 cpg_workflows/scripts/get_gencode_gtf.sh rename cpg_workflows/{query_modules => scripts}/seqr_loader_cnv.py (72%) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 1bc2aa66c..0841ca550 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.29.4 +current_version = 1.29.5 commit = True tag = False diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 86a990387..7b1760f9e 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -15,7 +15,7 @@ permissions: contents: read env: - VERSION: 1.29.4 + VERSION: 1.29.5 jobs: docker: diff --git a/configs/defaults/gatk_sv_multisample.toml b/configs/defaults/gatk_sv_multisample.toml index 872afdbb7..bdb37b17c 100644 --- a/configs/defaults/gatk_sv_multisample.toml +++ b/configs/defaults/gatk_sv_multisample.toml @@ -3,6 +3,9 @@ name = 'gatk_sv' ref_fasta = 'gs://cpg-common-main/references/hg38/v0/dragen_reference/Homo_sapiens_assembly38_masked.fasta' status_reporter = 'metamist' +# update this when cpg_workflows.scripts.get_gencode_gtf.sh is re-run +gencode_gtf_file = 'gs://cpg-common-main/references/hg38/v0/gencode_47.gtf.gz' + # Write dataset MTs for these datasets. Required for the AnnotateDatasetSv stage. # write_mt_for_datasets = [] diff --git a/configs/defaults/gcnv.toml b/configs/defaults/gcnv.toml index 512a677d0..56dbeb1f2 100644 --- a/configs/defaults/gcnv.toml +++ b/configs/defaults/gcnv.toml @@ -13,6 +13,9 @@ num_samples_per_scatter_block = 10 gncv_max_events = 320 gncv_max_pass_events = 30 +# update this when cpg_workflows.scripts.get_gencode_gtf.sh is re-run +gencode_gtf_file = 'gs://cpg-common-main/references/hg38/v0/gencode_47.gtf.gz' + [images] gatk_docker = "australia-southeast1-docker.pkg.dev/cpg-common/images/sv/gatk:2023-07-13-4.4.0.0-43-gd79823f9c-NIGHTLY-SNAPSHOT" gatk_gcnv = 'australia-southeast1-docker.pkg.dev/cpg-common/images/sv/gatk:4.2.6.1-57-g9e03432' diff --git a/cpg_workflows/jobs/gcnv.py b/cpg_workflows/jobs/gcnv.py index 09aa5a72c..8889b727a 100644 --- a/cpg_workflows/jobs/gcnv.py +++ b/cpg_workflows/jobs/gcnv.py @@ -11,7 +11,7 @@ from cpg_utils.config import config_retrieve, get_config, image_path from cpg_utils.hail_batch import command, fasta_res_group, get_batch, query_command from cpg_workflows.filetypes import CramPath -from cpg_workflows.query_modules import seqr_loader, seqr_loader_cnv +from cpg_workflows.query_modules import seqr_loader from cpg_workflows.resources import HIGHMEM from cpg_workflows.scripts import upgrade_ped_with_inferred from cpg_workflows.utils import can_reuse, chunks @@ -670,7 +670,7 @@ def update_vcf_attributes(input_tmp: str, output_file: str): def annotate_dataset_jobs_cnv( mt_path: Path, sgids: list[str], - out_mt_path: Path, + out_mt_path: str, tmp_prefix: Path, job_attrs: dict | None = None, ) -> list[Job]: @@ -688,7 +688,7 @@ def annotate_dataset_jobs_cnv( subset_j: Job | None = None if not subset_mt_path.exists(): subset_j = get_batch().new_job('subset cohort to dataset', (job_attrs or {}) | {'tool': 'hail query'}) - subset_j.image(image_path('cpg_workflows')) + subset_j.image(config_retrieve(['workflow', 'driver_image'])) subset_j.command( query_command( seqr_loader, @@ -701,16 +701,15 @@ def annotate_dataset_jobs_cnv( ) annotate_j = get_batch().new_job('annotate dataset', (job_attrs or {}) | {'tool': 'hail query'}) - annotate_j.image(image_path('cpg_workflows')) + annotate_j.image(config_retrieve(['workflow', 'driver_image'])) annotate_j.command( - query_command( - seqr_loader_cnv, - seqr_loader_cnv.annotate_dataset_gcnv.__name__, - str(subset_mt_path), - str(out_mt_path), - setup_gcp=True, - ), + 'seqr_loader_cnv ' + f'--mt_out {out_mt_path} ' + f'--checkpoint {str(tmp_prefix / "checkpoints")} ' + 'dataset ' # use the annotate_DATASET functionality + f'--mt_in {str(subset_mt_path)} ', ) + if subset_j: annotate_j.depends_on(subset_j) return [subset_j, annotate_j] diff --git a/cpg_workflows/jobs/seqr_loader_sv.py b/cpg_workflows/jobs/seqr_loader_sv.py index 0a28cc41f..26655b5a1 100644 --- a/cpg_workflows/jobs/seqr_loader_sv.py +++ b/cpg_workflows/jobs/seqr_loader_sv.py @@ -23,12 +23,15 @@ def annotate_cohort_jobs_sv( j = get_batch().new_job('Annotate cohort', job_attrs) j.image(get_config()['workflow']['driver_image']) + gencode_gz = get_config()['worfklow']['gencode_gtf_file'] + gencode_gtf_local = get_batch().read_input(str(gencode_gz)) j.command( query_command( seqr_loader_sv, seqr_loader_sv.annotate_cohort_sv.__name__, str(vcf_path), str(out_mt_path), + str(gencode_gtf_local), str(checkpoint_prefix), setup_gcp=True, ), diff --git a/cpg_workflows/query_modules/seqr_loader_sv.py b/cpg_workflows/query_modules/seqr_loader_sv.py index ace6a8531..a293d7545 100644 --- a/cpg_workflows/query_modules/seqr_loader_sv.py +++ b/cpg_workflows/query_modules/seqr_loader_sv.py @@ -4,15 +4,13 @@ import gzip import logging - -import requests +from os.path import join import hail as hl -from cpg_utils import to_path from cpg_utils.config import get_config, reference_path from cpg_utils.hail_batch import genome_build -from cpg_workflows.utils import checkpoint_hail, read_hail +from cpg_workflows.utils import generator_chunks, read_hail # I'm just going to go ahead and steal these constants from their seqr loader BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' @@ -103,48 +101,16 @@ def get_cpx_interval(x): return hl.struct(type=type_chr[0], chrom=chr_pos[0], start=hl.int32(pos[0]), end=hl.int32(pos[1])) -def download_gencode_gene_id_mapping(gencode_release: str) -> str: - """ - This is an inefficient stand-in for now. Swap this out with a more permanent - location for this file in the resources bucket - Not suuuuuuper keen on storing a pickled representation, but a minimised JSON - would be a good middle ground. Parsed ~= 62000 {str: str} - Args: - gencode_release (str | int): Which gencode release do you want? - Returns: - str - path to localised GTF file +def parse_gtf_from_local(gtf_path: str, chunk_size: int | None = None) -> hl.dict: """ + Read over the localised GTF file and read into a dict - # this is the thing - local_gtf = 'localfile.gtf.gz' - - # check for presence in config - if (config_gtf := get_config().get('gencode_gtf_path')) is not None: - # treat as GCP - if config_gtf.startswith('gs://'): - to_path(config_gtf).copy(local_gtf) - gtf_path = local_gtf - else: - gtf_path = config_gtf - else: - gtf_path = GENCODE_GTF_URL.format(gencode_release=gencode_release) - - # if it wasn't a local file, download it - if gtf_path.startswith(('http://', 'https://')): - gz_stream = requests.get(gtf_path, stream=True) - with open(local_gtf, 'wb') as f: - f.writelines(gz_stream) - gz_stream.close() - - # OK - I think that covers all the viable situations for now - return local_gtf - - -def parse_gtf_from_local(gtf_path: str) -> hl.dict: - """ - Read over the localised file and read into a dict + n.b. due to a limit in Spark of 20MB per String length, the dictionary here is actually too large to be used + in annotation expressions. To remedy this, the dictionary is returned as a list of fragments, and we can use each + one in turn, then create a checkpoint between them. Args: gtf_path (): + chunk_size (int): if specified, returns this dict as a list of dicts Returns: the gene lookup dictionary as a Hail DictExpression """ @@ -165,12 +131,24 @@ def parse_gtf_from_local(gtf_path: str) -> hl.dict: # parse info field info_fields_list = [x.strip().split() for x in record['info'].split(';') if x != ''] info_fields = {k: v.strip('"') for k, v in info_fields_list} + + # skip an ENSG: ENSG mapping, redundant... + if info_fields['gene_name'].startswith('ENSG'): + continue gene_id_mapping[info_fields['gene_name']] = info_fields['gene_id'].split('.')[0] - logging.info('Completed ingestion of gene-ID mapping') - return hl.literal(gene_id_mapping) + + all_keys = list(gene_id_mapping.keys()) + logging.info(f'Completed ingestion of gene-ID mapping, {len(all_keys)} entries') + if chunk_size is None: + return [hl.literal(gene_id_mapping)] + + # hail can't impute the type of a generator, so do this in baby steps + sub_dictionaries = [{key: gene_id_mapping[key] for key in keys} for keys in generator_chunks(all_keys, chunk_size)] + + return [hl.literal(each_dict) for each_dict in sub_dictionaries] -def annotate_cohort_sv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str | None = None): +def annotate_cohort_sv(vcf_path: str, out_mt_path: str, gencode_gz: str, checkpoint: str | None = None): """ Translate an annotated SV VCF into a Seqr-ready format Relevant gCNV specific schema @@ -180,7 +158,8 @@ def annotate_cohort_sv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str | Args: vcf_path (str): Where is the VCF?? out_mt_path (str): And where do you need output!? - checkpoint_prefix (str): CHECKPOINT!@!! + gencode_gz (str): The path to a compressed GENCODE GTF file + checkpoint (str): CHECKPOINT!@!! """ logger = logging.getLogger('annotate_cohort_sv') @@ -267,11 +246,11 @@ def annotate_cohort_sv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str | mt = mt.annotate_rows(info=mt.info.annotate(CPX_TYPE=mt.sv_types[0])) # save those changes - mt = checkpoint_hail(mt, 'initial_annotation_round.mt', checkpoint_prefix) + if checkpoint: + mt = mt.checkpoint(join(checkpoint, 'initial_annotation_round.mt')) # get the Gene-Symbol mapping dict - gene_id_mapping_file = download_gencode_gene_id_mapping(get_config().get('gencode_release', '46')) - gene_id_mapping = parse_gtf_from_local(gene_id_mapping_file) + gene_id_mapping = parse_gtf_from_local(gencode_gz)[0] # OK, NOW IT'S BUSINESS TIME conseq_predicted_gene_cols = [ diff --git a/cpg_workflows/scripts/get_gencode_gtf.sh b/cpg_workflows/scripts/get_gencode_gtf.sh new file mode 100644 index 000000000..46dd27c0e --- /dev/null +++ b/cpg_workflows/scripts/get_gencode_gtf.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# use this get_gencode_gtf.sh script to download the latest GENCODE GTF file +# and place it in the appropriate place in the CPG bucket + +set -ex + +GENCODE_VERSION=${1:-"47"} + +# download the GTF file +wget -O gencode.gtf.gz "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${GENCODE_VERSION}/gencode.v${GENCODE_VERSION}.annotation.gtf.gz" + +# don't unzip the file - unzipping inflates from ~50MB to ~1.8GB, and transfer is probably the bottleneck +# place the file in the place in the CPG bucket +gsutil cp gencode.gtf "gs://cpg-common-main/references/hg38/v0/gencode_${GENCODE_VERSION}.gtf.gz" diff --git a/cpg_workflows/query_modules/seqr_loader_cnv.py b/cpg_workflows/scripts/seqr_loader_cnv.py similarity index 72% rename from cpg_workflows/query_modules/seqr_loader_cnv.py rename to cpg_workflows/scripts/seqr_loader_cnv.py index 462546774..dcf9c9b82 100644 --- a/cpg_workflows/query_modules/seqr_loader_cnv.py +++ b/cpg_workflows/scripts/seqr_loader_cnv.py @@ -1,20 +1,16 @@ """ -Hail Query functions for seqr loader; SV edition. +Hail Query functions for seqr loader; CNV edition. """ import datetime import logging +from argparse import ArgumentParser +from os.path import join import hail as hl -from cpg_utils.config import get_config -from cpg_utils.hail_batch import genome_build -from cpg_workflows.query_modules.seqr_loader_sv import ( - download_gencode_gene_id_mapping, - get_expr_for_xpos, - parse_gtf_from_local, -) -from cpg_workflows.utils import checkpoint_hail, read_hail +from cpg_utils.hail_batch import genome_build, init_batch +from cpg_workflows.query_modules.seqr_loader_sv import get_expr_for_xpos, parse_gtf_from_local # I'm just going to go ahead and steal these constants from their seqr loader GENE_SYMBOL = 'gene_symbol' @@ -30,7 +26,7 @@ } -def annotate_cohort_gcnv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str | None = None): +def annotate_cohort_gcnv(vcf: str, mt_out: str, gencode: str, checkpoint: str, *args, **kwargs): """ Translate an annotated gCNV VCF into a Seqr-ready format Relevant gCNV specific schema @@ -38,17 +34,15 @@ def annotate_cohort_gcnv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str Relevant gCNV loader script https://github.com/populationgenomics/seqr-loading-pipelines/blob/master/luigi_pipeline/seqr_gcnv_loading.py Args: - vcf_path (str): Where is the VCF?? - out_mt_path (str): And where do you need output!? - checkpoint_prefix (str): CHECKPOINT!@!! + vcf (str): Where is the VCF?? + mt_out (str): And where do you need output!? + gencode (str): The path to a compressed GENCODE GTF file + checkpoint (str): location we can write checkpoints to """ - logger = logging.getLogger('annotate_cohort_gcnv') - logger.setLevel(logging.INFO) - - logger.info(f'Importing SV VCF {vcf_path}') + logging.info(f'Importing SV VCF {vcf}') mt = hl.import_vcf( - vcf_path, + vcf, array_elements_required=False, force_bgz=True, reference_genome=genome_build(), @@ -57,7 +51,7 @@ def annotate_cohort_gcnv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str # add attributes required for Seqr mt = mt.annotate_globals( - sourceFilePath=vcf_path, + sourceFilePath=vcf, genomeVersion=genome_build().replace('GRCh', ''), hail_version=hl.version(), datasetType='SV', @@ -89,13 +83,6 @@ def annotate_cohort_gcnv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str num_exon=hl.agg.max(mt.NP), ) - # save those changes - mt = checkpoint_hail(mt, 'initial_annotation_round.mt', checkpoint_prefix) - - # get the Gene-Symbol mapping dict - gene_id_mapping_file = download_gencode_gene_id_mapping(get_config().get('gencode_release', '46')) - gene_id_mapping = parse_gtf_from_local(gene_id_mapping_file) - # find all the column names which contain Gene symbols conseq_predicted_gene_cols = [ gene_col @@ -116,15 +103,29 @@ def annotate_cohort_gcnv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str ), ) + mt = mt.checkpoint('pre-gene_annotation.mt', overwrite=True) + + # this next section is currently failing - the dictionary of genes is too large + # to be used in an annotation expression. At least... I think it is + # for i, chunks in enumerate(chunks(gene_id_mapping, 100)): + + # get the Gene-Symbol mapping dict + gene_id_mappings = parse_gtf_from_local(gencode, chunk_size=15000) + # overwrite symbols with ENSG IDs in these columns # not sure why this is required, I think SV annotation came out # with ENSGs from the jump, but this is all symbols - for col_name in conseq_predicted_gene_cols: - mt = mt.annotate_rows( - info=mt.info.annotate( - **{col_name: hl.map(lambda gene: gene_id_mapping.get(gene, gene), mt.info[col_name])}, - ), - ) + for i, gene_id_mapping in enumerate(gene_id_mappings): + logging.info(f'Processing gene ID mapping chunk {i}') + for col_name in conseq_predicted_gene_cols: + mt = mt.annotate_rows( + info=mt.info.annotate( + **{col_name: hl.map(lambda gene: gene_id_mapping.get(gene, gene), mt.info[col_name])}, + ), + ) + + # # checkpoint this chunk + mt = mt.checkpoint(join(checkpoint, f'fragment_{i}.mt')) mt = mt.annotate_rows( # this expected mt.variant_name to be present, and it's not @@ -173,21 +174,21 @@ def annotate_cohort_gcnv(vcf_path: str, out_mt_path: str, checkpoint_prefix: str ) # write this output - mt.write(out_mt_path, overwrite=True) + mt.write(mt_out, overwrite=True) -def annotate_dataset_gcnv(mt_path: str, out_mt_path: str): +def annotate_dataset_gcnv(mt_in: str, mt_out: str, *args, **kwargs): """ process data specific to samples in this dataset do this after sub-setting to specific samples Args: - mt_path (str): path to the annotated MatrixTable - out_mt_path (str): and where do you want it to end up? + mt_in (str): path to the annotated MatrixTable + mt_out (str): and where do you want it to end up? """ logging.info('Annotating genotypes') - mt = read_hail(mt_path) + mt = hl.read_matrix_table(mt_in) # adding in the GT here, that may cause problems later? mt = mt.annotate_rows( @@ -274,5 +275,41 @@ def _filter_num_alt(i, g): ) logging.info('Genotype fields annotated') mt.describe() - mt.write(out_mt_path, overwrite=True) - logging.info(f'Written gCNV MT to {out_mt_path}') + mt.write(mt_out, overwrite=True) + logging.info(f'Written gCNV MT to {mt_out}') + + +def cli_main(): + """ + command line entrypoint + """ + # enable Info-level logging + logging.basicConfig(level=logging.INFO) + + init_batch() + + # set up an argument parser to allow two separate entrypoints + parser = ArgumentParser() + # these arguments are used for both entrypoints + parser.add_argument('--mt_out', help='Path to the MatrixTable, input or output', required=True) + parser.add_argument('--checkpoint', help='Dir to write checkpoints to', required=True) + subparsers = parser.add_subparsers() + + # a parser for the AnnotateCohort method + cohort_parser = subparsers.add_parser('cohort') + cohort_parser.add_argument('--vcf', help='Path to input VCF file') + cohort_parser.add_argument('--gencode', help='Path to input gencode GTF file') + cohort_parser.set_defaults(func=annotate_cohort_gcnv) + + # a parser for the AnnotateDataset method + cohort_parser = subparsers.add_parser('dataset') + cohort_parser.add_argument('--mt_in', help='Path to input MT') + cohort_parser.set_defaults(func=annotate_dataset_gcnv) + + args = parser.parse_args() + + args.func(**vars(args)) + + +if __name__ == '__main__': + cli_main() diff --git a/cpg_workflows/stages/gcnv.py b/cpg_workflows/stages/gcnv.py index 1978591f6..d05a99262 100644 --- a/cpg_workflows/stages/gcnv.py +++ b/cpg_workflows/stages/gcnv.py @@ -9,9 +9,8 @@ from cpg_utils import Path, to_path from cpg_utils.config import AR_GUID_NAME, config_retrieve, image_path, reference_path, try_get_ar_guid -from cpg_utils.hail_batch import get_batch, query_command +from cpg_utils.hail_batch import get_batch from cpg_workflows.jobs import gcnv -from cpg_workflows.query_modules import seqr_loader_cnv from cpg_workflows.stages.gatk_sv.gatk_sv_common import get_images, get_references, queue_annotate_sv_jobs from cpg_workflows.stages.seqr_loader import es_password from cpg_workflows.targets import Cohort, Dataset, MultiCohort, SequencingGroup @@ -612,20 +611,20 @@ def queue_jobs(self, multicohort: MultiCohort, inputs: StageInput) -> StageOutpu Fire up the job to ingest the cohort VCF as a MT, and rearrange the annotations """ - vcf_path = inputs.as_path(target=multicohort, stage=AnnotateCNVVcfWithStrvctvre, key='strvctvre_vcf') + vcf_path = str(inputs.as_path(target=multicohort, stage=AnnotateCNVVcfWithStrvctvre, key='strvctvre_vcf')) outputs = self.expected_outputs(multicohort) j = get_batch().new_job('annotate gCNV cohort', self.get_job_attrs(multicohort)) - j.image(image_path('cpg_workflows')) + j.image(config_retrieve(['worfklow', 'driver_image'])) + gencode_gz = config_retrieve(['worfklow', 'gencode_gtf_file']) + gencode_gtf_local = get_batch().read_input(str(gencode_gz)) j.command( - query_command( - seqr_loader_cnv, - seqr_loader_cnv.annotate_cohort_gcnv.__name__, - str(vcf_path), - str(outputs['mt']), - str(self.tmp_prefix / 'checkpoints'), - setup_gcp=True, - ), + 'seqr_loader_cnv ' + f'--mt_out {str(outputs["mt"])} ' + f'--checkpoint {str(self.tmp_prefix / "checkpoints")} ' + f'cohort ' # use the annotate_COHORT functionality + f'--vcf {vcf_path} ' + f'--gencode {gencode_gtf_local} ', ) return self.make_outputs(multicohort, data=outputs, jobs=j) @@ -657,7 +656,7 @@ def queue_jobs(self, dataset: Dataset, inputs: StageInput) -> StageOutput | None jobs = gcnv.annotate_dataset_jobs_cnv( mt_path=mt_in, sgids=dataset.get_sequencing_group_ids(), - out_mt_path=outputs['mt'], + out_mt_path=str(outputs['mt']), tmp_prefix=self.tmp_prefix / dataset.name / 'checkpoints', job_attrs=self.get_job_attrs(dataset), ) diff --git a/setup.py b/setup.py index fa1a3d614..746f0b8e6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='cpg-workflows', # This tag is automatically updated by bumpversion - version='1.29.4', + version='1.29.5', description='CPG workflows for Hail Batch', long_description=open('README.md').read(), long_description_content_type='text/markdown', @@ -68,6 +68,7 @@ 'mt_to_es = cpg_workflows.scripts.mt_to_es_without_dataproc:main', # Generate new intervals from a MatrixTable 'new_intervals_from_mt = cpg_workflows.scripts.generate_new_intervals:cli_main', + 'seqr_loader_cnv = cpg_workflows.scripts.seqr_loader_cnv:cli_main', ], }, )