From 83cb92ce5759b030f2136b681842b7cbe2170304 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 17 Jun 2024 14:52:26 -0400 Subject: [PATCH] Add VRS data to GraphQL API --- dataset-metadata/metadata.ts | 22 + graphql-api/src/graphql/resolvers/va.spec.ts | 175 ++++++++ graphql-api/src/graphql/resolvers/va.ts | 396 ++++++++++++++++++ .../src/graphql/resolvers/variant-fields.ts | 6 + graphql-api/src/graphql/resolvers/variants.ts | 44 +- graphql-api/src/graphql/types/query.graphql | 52 ++- graphql-api/src/graphql/types/va.graphql | 116 +++++ graphql-api/src/graphql/types/variant.graphql | 7 + .../gnomad-v4-variant-queries.ts | 19 +- 9 files changed, 802 insertions(+), 35 deletions(-) create mode 100644 graphql-api/src/graphql/resolvers/va.spec.ts create mode 100644 graphql-api/src/graphql/resolvers/va.ts create mode 100644 graphql-api/src/graphql/types/va.graphql diff --git a/dataset-metadata/metadata.ts b/dataset-metadata/metadata.ts index ff6fe624d..0b6ed9262 100644 --- a/dataset-metadata/metadata.ts +++ b/dataset-metadata/metadata.ts @@ -73,6 +73,7 @@ export type DatasetMetadata = { structuralVariantDatasetId: DatasetId copyNumberVariantDatasetId: DatasetId hasJointFrequencyData: boolean + hasVRSData: boolean } const metadata: Record = { @@ -124,6 +125,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r2_1: { isSubset: false, @@ -173,6 +175,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r2_1_controls: { isSubset: true, @@ -222,6 +225,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r2_1_non_cancer: { isSubset: true, @@ -271,6 +275,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r2_1_non_neuro: { isSubset: true, @@ -320,6 +325,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r2_1_non_topmed: { isSubset: true, @@ -369,6 +375,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r3: { isSubset: false, @@ -418,6 +425,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r3_controls_and_biobanks: { isSubset: true, @@ -467,6 +475,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r3_non_cancer: { isSubset: true, @@ -516,6 +525,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r3_non_neuro: { isSubset: true, @@ -565,6 +575,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r3_non_topmed: { isSubset: true, @@ -614,6 +625,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_r3_non_v2: { isSubset: true, @@ -663,6 +675,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_sv_r2_1: { isSubset: false, @@ -712,6 +725,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_sv_r2_1_controls: { isSubset: true, @@ -761,6 +775,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_sv_r2_1_non_neuro: { isSubset: true, @@ -810,6 +825,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: false, + hasVRSData: false, }, gnomad_sv_r4: { isSubset: false, @@ -859,6 +875,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: true, + hasVRSData: false, }, gnomad_cnv_r4: { isSubset: false, @@ -908,6 +925,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: true, hasJointFrequencyData: true, + hasVRSData: false, }, gnomad_r4: { isSubset: false, @@ -957,6 +975,7 @@ const metadata: Record = { copyNumberVariantDatasetId: 'gnomad_cnv_r4', hasCopyNumberVariantCoverage: false, hasJointFrequencyData: true, + hasVRSData: true, }, gnomad_r4_non_ukb: { isSubset: true, @@ -1006,6 +1025,7 @@ const metadata: Record = { hasCopyNumberVariantCoverage: false, hasRelatedVariants: true, hasJointFrequencyData: false, + hasVRSData: true, }, } @@ -1156,3 +1176,5 @@ export const getTopLevelDataset = (datasetId: DatasetId) => { return 'default' } + +export const hasVRSData = (datasetId: DatasetId) => getMetadata(datasetId, 'hasVRSData') diff --git a/graphql-api/src/graphql/resolvers/va.spec.ts b/graphql-api/src/graphql/resolvers/va.spec.ts new file mode 100644 index 000000000..410b3d5bc --- /dev/null +++ b/graphql-api/src/graphql/resolvers/va.spec.ts @@ -0,0 +1,175 @@ +import { describe, expect, test } from '@jest/globals' + +import { + resolveVACohortAlleleFrequencies, + resolveVAAllele, + Allele as VAAllele, + CohortAlleleFrequency, +} from './va' + +const alleleEsDocument = { + vrs: { + ref: { + allele_id: 'ga4gh:SQ.IAmTheRefStateID', + start: 123, + end: 234, + state: 'G', + }, + alt: { + allele_id: 'ga4gh:SQ.IAmTheAltStateID', + start: 124, + end: 235, + state: 'A', + }, + }, +} + +const expectedAllele: VAAllele = { + _id: 'ga4gh:SQ.IAmTheAltStateID', + type: 'Allele', + location: { + _id: 'ga4gh:VSL.OogSNIt-1Z7HF4tbdm45IDLYc7-oSE2Y', + type: 'SequenceLocation', + sequence_id: '2mN7PzLXx-QQq2GVIODPRSkWmlwybsv0', + interval: { + type: 'SequenceInterval', + start: { type: 'Number', value: 124 }, + end: { type: 'Number', value: 235 }, + }, + }, + state: { + type: 'LiteralSequenceExpression', + sequence: 'A', + }, +} + +describe('resolveVAAllele', () => { + test('parses a single allele correctly', async () => { + const resolved = await resolveVAAllele(alleleEsDocument, null, null) + expect(resolved).toEqual(expectedAllele) + }) +}) + +describe('resolveVACohortAlleleFrequency', () => { + const exomeEsDocument = { + ac: 5, + an: 100, + hemizygote_count: 2, + homozygote_count: 3, + faf95: { popmax: 0.123, popmax_population: 'afr' }, + ancestry_groups: [], + } + + const variantESDocument = { + ...alleleEsDocument, + variant_id: '1-123-G-A', + exome: exomeEsDocument, + joint: { fafmax: { faf95_max: 0.234, faf95_max_gen_anc: 'amr' } }, + } + + test('parses a single CohortAlleleFrequency correctly', async () => { + const resolved = await resolveVACohortAlleleFrequencies(variantESDocument, null, null) + const expected: CohortAlleleFrequency[] = [ + { + id: 'gnomad4:1-123-G-A', + label: 'Overall Cohort Allele Frequency for 1-123-G-A', + type: 'CohortAlleleFrequency', + focusAllele: expectedAllele, + derivedFrom: { + id: 'gnomad4.1.0', + type: 'DataSet', + label: 'gnomAD v4.1.0', + version: '4.1.0', + }, + focusAlleleCount: 5, + locusAlleleCount: 100, + alleleFrequency: 0.05, + cohort: { id: 'ALL', label: 'Overall', characteristics: null }, + ancillaryResults: { + grpMaxFAF95: { frequency: 0.123, confidenceInterval: 0.95, groupId: 'afr' }, + jointGrpMaxFAF95: { frequency: 0.234, confidenceInterval: 0.95, groupId: 'amr' }, + homozygotes: 3, + hemizygotes: 2, + }, + subcohortFrequency: [], + }, + ] + + expect(resolved).toEqual(expected) + }) + + test('has the correct subcohortAlleleFrequency when there are multiple CAFs', async () => { + // Shuffled order of IDs is intentional here to better test sorting + const subcohortIds = [ + 'eur_XY', + 'XY', + 'ami_XX', + 'amr', + 'XX', + 'ami', + 'ami_XY', + 'amr_XX', + 'eur', + 'amr_XY', + 'eur_XX', + ] + const subcohortDocuments = subcohortIds.map((subcohortId) => ({ + ...exomeEsDocument, + id: subcohortId, + })) + + const fullDocument = { + ...variantESDocument, + exome: { ...exomeEsDocument, ancestry_groups: subcohortDocuments }, + } + + const resolved = await resolveVACohortAlleleFrequencies(fullDocument, null, null) + expect(resolved && resolved.length === subcohortIds.length + 1).toEqual(true) + + const subcohortMap: Record = resolved!.reduce( + (acc, cohort) => ({ + ...acc, + [cohort.id]: cohort.subcohortFrequency.map((subcohort) => subcohort.id), + }), + {} + ) + + expect(subcohortMap['gnomad4:1-123-G-A']!.sort()).toEqual( + subcohortIds.map((cohortId) => `gnomad4:1-123-G-A.${cohortId}`).sort() + ) + + expect(subcohortMap['gnomad4:1-123-G-A.XX'].sort()).toEqual([ + 'gnomad4:1-123-G-A.ami_XX', + 'gnomad4:1-123-G-A.amr_XX', + 'gnomad4:1-123-G-A.eur_XX', + ]) + + expect(subcohortMap['gnomad4:1-123-G-A.XY'].sort()).toEqual([ + 'gnomad4:1-123-G-A.ami_XY', + 'gnomad4:1-123-G-A.amr_XY', + 'gnomad4:1-123-G-A.eur_XY', + ]) + + expect(subcohortMap['gnomad4:1-123-G-A.ami'].sort()).toEqual([ + 'gnomad4:1-123-G-A.ami_XX', + 'gnomad4:1-123-G-A.ami_XY', + ]) + + expect(subcohortMap['gnomad4:1-123-G-A.amr'].sort()).toEqual([ + 'gnomad4:1-123-G-A.amr_XX', + 'gnomad4:1-123-G-A.amr_XY', + ]) + + expect(subcohortMap['gnomad4:1-123-G-A.eur'].sort()).toEqual([ + 'gnomad4:1-123-G-A.eur_XX', + 'gnomad4:1-123-G-A.eur_XY', + ]) + + expect(subcohortMap['gnomad4:1-123-G-A.ami_XX']).toEqual([]) + expect(subcohortMap['gnomad4:1-123-G-A.ami_XY']).toEqual([]) + expect(subcohortMap['gnomad4:1-123-G-A.amr_XX']).toEqual([]) + expect(subcohortMap['gnomad4:1-123-G-A.amr_XY']).toEqual([]) + expect(subcohortMap['gnomad4:1-123-G-A.eur_XX']).toEqual([]) + expect(subcohortMap['gnomad4:1-123-G-A.eur_XY']).toEqual([]) + }) +}) diff --git a/graphql-api/src/graphql/resolvers/va.ts b/graphql-api/src/graphql/resolvers/va.ts new file mode 100644 index 000000000..d8653e557 --- /dev/null +++ b/graphql-api/src/graphql/resolvers/va.ts @@ -0,0 +1,396 @@ +import { createHash } from 'crypto' + +const POPULATION_NAMES: Record = { + afr: 'African/African-American', + ami: 'Amish', + amr: 'Admixed American', + asj: 'Ashkenazi Jewish', + eas: 'East Asian', + eur: 'European', + fin: 'Finnish', + mid: 'Middle Eastern', + nfe: 'Non-Finnish European', + oth: 'Other', + remaining: 'Remaining individuals', + sas: 'South Asian', + uniform: 'Uniform', + sas_non_consang: 'South Asian (F < 0.05)', + consanguineous: 'South Asian (F > 0.05)', + exac: 'ExAC', + bgr: 'Bulgarian (Eastern European)', + est: 'Estonian', + gbr: 'British', + nwe: 'North-Western European', + seu: 'Southern European', + swe: 'Swedish', + kor: 'Korean', + sgp: 'Singaporean', + jpn: 'Japanese', + oea: 'Other East Asian', + oeu: 'Other European', + onf: 'Other Non-Finnish European', + unk: 'Unknown', +} + +// "VANumber" because "Number" is taken +type VANumber = { type: string; value: number } + +type SequenceInterval = { + type: string + start: VANumber + end: VANumber +} + +type SequenceLocation = { + _id: string | null + type: string + sequence_id: string + interval: SequenceInterval +} + +type UnhashedSequenceLocation = Omit + +type LiteralSequenceExpression = { + type: string + sequence: string +} + +export type Allele = { + _id: string | null + type: string + location: SequenceLocation + state: LiteralSequenceExpression +} + +type CohortCharacteristic = { + name: string + value: string +} + +type Cohort = { + id: string + label: string | null + characteristics: CohortCharacteristic[] | null +} + +type CohortAlleleFrequencyDerivation = { + id: string | null + type: string | null + label: string | null + version: string | null +} + +type GrpMaxFAF95 = { + frequency: number + confidenceInterval: number + groupId: string +} + +type AncillaryResults = { + grpMaxFAF95: GrpMaxFAF95 | null + jointGrpMaxFAF95: GrpMaxFAF95 | null + homozygotes: number | null + hemizygotes: number | null +} + +export type CohortAlleleFrequency = { + id: string + type: string + label: string | null + derivedFrom: CohortAlleleFrequencyDerivation | null + focusAllele: Allele + focusAlleleCount: number + locusAlleleCount: number + alleleFrequency: number + cohort: Cohort + ancillaryResults: AncillaryResults | null + subcohortFrequency: CohortAlleleFrequency[] +} + +type CohortAlleleFrequencyWithoutSubcohorts = Omit + +const hashWithSha512t24u = (s: string): string => { + const sha = createHash('sha512').update(s).digest() + const truncatedSha = Buffer.copyBytesFrom(sha, 0, 24) + return truncatedSha.toString('base64url') +} + +type JSONAble = string | number | Record + +const normalizedStringify = (input: JSONAble): string => { + if (typeof input === 'string' || typeof input === 'number') { + return JSON.stringify(input) + } + + const keysToSerialize = Object.keys(input) + .filter((key) => !key.startsWith('_')) + .sort() + + const serializedPairs = keysToSerialize.map((key) => { + return `"${key}":${normalizedStringify(input[key])}` + }) + return `{${serializedPairs.join(',')}}` +} + +const generateLocationId = (location: UnhashedSequenceLocation) => { + const normalizedJSON = normalizedStringify(location) + const hash = hashWithSha512t24u(normalizedJSON) + return `ga4gh:VSL.${hash}` +} + +const generateSequenceId = (sequence: string) => { + return hashWithSha512t24u(sequence) +} + +export const resolveVAAllele = async (obj: any, _args: any, _ctx: any): Promise => { + const vrsData = obj.vrs + + if (!vrsData) { + return null + } + + const { ref, alt } = vrsData + + const altVRSId = alt.allele_id as string + const refSequence = ref.state as string + const altSequence = alt.state as string + const altState: LiteralSequenceExpression = { + type: 'LiteralSequenceExpression', + sequence: altSequence, + } + const sequenceId = generateSequenceId(refSequence) + const interval: SequenceInterval = { + type: 'SequenceInterval', + start: { type: 'Number', value: alt.start }, + end: { type: 'Number', value: alt.end }, + } + const unhashedLocation: UnhashedSequenceLocation = { + type: 'SequenceLocation', + sequence_id: sequenceId, + interval, + } + const location: SequenceLocation = { + ...unhashedLocation, + _id: generateLocationId(unhashedLocation), + } + + return { _id: altVRSId, type: 'Allele', location, state: altState } +} + +type Subset = { + id?: string + ac: number + an: number + hemizygote_count: number + homozygote_count: number + grpMax?: GrpMaxFAF95 + jointGrpMax?: GrpMaxFAF95 +} + +const GNOMAD_V4_DERIVATION = { + id: 'gnomad4.1.0', + type: 'DataSet', + label: 'gnomAD v4.1.0', + version: '4.1.0', +} + +const getAncestryAndSexIds = (subsetId: string): [string | undefined, string | undefined] => { + const [first, second] = subsetId.split('_') + return first === 'XX' || first === 'XY' ? [undefined, first] : [first, second] +} + +const cohortDescription = (subsetId: string | undefined): string => { + if (subsetId === undefined) { + return 'Overall Cohort' + } + + const [ancestryGroupId, sexId] = getAncestryAndSexIds(subsetId) + + if (ancestryGroupId) { + const ancestryGroupName = POPULATION_NAMES[ancestryGroupId] + if (sexId) { + return `${ancestryGroupName} ${sexId} Ancestry Group` + } + return `${ancestryGroupName} Ancestry Group` + } + return sexId! +} + +const cohortForSubset = (subset: Subset): Cohort => { + if (!subset.id) { + return { id: 'ALL', label: 'Overall', characteristics: null } + } + + const [ancestryGroupId, sexId] = getAncestryAndSexIds(subset.id) + + const sexCharacteristics: CohortCharacteristic[] = sexId + ? [{ name: 'biological sex', value: sexId }] + : [] + const ancestryCharacteristics: CohortCharacteristic[] = + ancestryGroupId && POPULATION_NAMES[ancestryGroupId] + ? [ + { + name: 'genetic ancestry', + value: POPULATION_NAMES[ancestryGroupId], + }, + ] + : [] + const characteristics = [...sexCharacteristics, ...ancestryCharacteristics] + + return { id: subset.id || 'ALL', label: cohortDescription(subset.id), characteristics } +} + +const resolveVACohortAlleleFrequency = ( + focusAllele: Allele, + variant_id: string, + subset: Subset +): CohortAlleleFrequencyWithoutSubcohorts => { + const idSuffix = subset.id ? `.${subset.id}` : '' + const id = `gnomad4:${variant_id}${idSuffix}` + const label = `${cohortDescription(subset.id)} Allele Frequency for ${variant_id}` + + const cohort = cohortForSubset(subset) + + const ancillaryResults = { + grpMaxFAF95: subset.grpMax || null, + jointGrpMaxFAF95: subset.jointGrpMax || null, + homozygotes: subset.homozygote_count !== undefined ? subset.homozygote_count : null, + hemizygotes: subset.hemizygote_count !== undefined ? subset.hemizygote_count : null, + } + + return { + id, + label, + type: 'CohortAlleleFrequency', + focusAllele, + derivedFrom: GNOMAD_V4_DERIVATION, + focusAlleleCount: subset.ac, + locusAlleleCount: subset.an, + alleleFrequency: subset.ac / subset.an, + cohort, + ancillaryResults, + } +} + +const findSubcohortIds = (cohortId: string, possibleSubcohortIds: string[]): string[] => { + const otherCohortIds = possibleSubcohortIds.filter((otherId) => otherId !== cohortId) + + const suffix = cohortId.split('.')[1] || '' + + if (suffix === 'XX' || suffix === 'XY') { + return otherCohortIds.filter((otherCohortId) => otherCohortId.endsWith(suffix)) + } + + return otherCohortIds.filter((otherCohortId) => otherCohortId.startsWith(cohortId)) +} + +/* Quick refresher to save you checking Wikipedia at this point: topo sort + * takes a list of items that can have dependencies from one item to another, + * and returns them in an order such that, if A depends on B, B is guaranteed + * to appear before A in the output. The classic example is what package + * managers (npm, apt, rubygems, etc.) do: you want to install package A, + * which depends on B and C, and B in turn depends on D and E. The package + * manager will make sure that D and E are installed before it tries to + * install B, that B and C are installed before A, and so on. However, the + * order between B and C isn't guaranteed either way, since neither of them + * depends on the other, either directly or indirectly. + * + * In this particular case, we're ordering subcohorts, and we say A depends on + * B if B is a subset of A. The reason we use this definition is that our + * ultimate goal is to fill in the subcohortFrequency field for each + * CohortAlleleFrequency. subcohortFrequency contains a list of + * CohortAlleleFrequency, each of which will also contain its own (possibly + * empty) list of its own subcohorts in its subcohortFrequency field, so + * to compute the subcohortFrequency for A, first we must fill in B's, and so + * on recursively. + */ + +const topologicalSortLoop = ( + subcohortMap: Record, + remaining: string[], + sorted: string[] +): string[] => { + if (remaining.length === 0) { + return sorted + } + + const nextEligible = remaining.find((cohortId) => { + const subcohortIds = subcohortMap[cohortId] + return subcohortIds.every((subcohortId) => sorted.includes(subcohortId)) + })! + + const newRemaining = remaining.filter((cohortId) => cohortId !== nextEligible) + const newSorted = [...sorted, nextEligible] + return topologicalSortLoop(subcohortMap, newRemaining, newSorted) +} + +const topologicalSort = (cohortIds: string[]): string[] => { + const subcohortMap: Record = cohortIds.reduce((acc, cohortId) => { + const subcohortIds = findSubcohortIds(cohortId, cohortIds) + + return { ...acc, [cohortId]: subcohortIds } + }, {}) + + return topologicalSortLoop(subcohortMap, Object.keys(subcohortMap), []) +} + +const addSubcohorts = ( + cohortsWithoutSubcohorts: CohortAlleleFrequencyWithoutSubcohorts[] +): CohortAlleleFrequency[] => { + const cohortsById: Record = + cohortsWithoutSubcohorts.reduce((acc, cohort) => ({ ...acc, [cohort.id]: cohort }), {}) + const cohortIds = cohortsWithoutSubcohorts.map((cohort) => cohort.id) + + const sortedCohortIds: string[] = topologicalSort(cohortIds) + + const subcohortMap: Record = sortedCohortIds.reduce( + (acc, cohortId) => { + const cohort: CohortAlleleFrequencyWithoutSubcohorts = cohortsById[cohortId] + const subcohorts: CohortAlleleFrequency[] = findSubcohortIds(cohortId, cohortIds).map( + (subcohortId) => acc[subcohortId] + ) + const filledInCohort: CohortAlleleFrequency = { ...cohort, subcohortFrequency: subcohorts } + return { ...acc, [cohortId]: filledInCohort } + }, + {} as Record + ) + return Object.values(subcohortMap) +} + +export const resolveVACohortAlleleFrequencies = async ( + obj: any, + args: any, + ctx: any +): Promise => { + const focusAllele = await resolveVAAllele(obj, args, ctx) + if (focusAllele === null) { + return null + } + + const frequencies = obj.exome || obj.genome + const fullSet: Subset = { + ac: frequencies.ac, + an: frequencies.an, + hemizygote_count: frequencies.hemizygote_count, + homozygote_count: frequencies.homozygote_count, + grpMax: frequencies && { + frequency: frequencies.faf95.popmax, + groupId: frequencies.faf95.popmax_population, + confidenceInterval: 0.95, + }, + jointGrpMax: + obj.joint && obj.joint.fafmax && obj.joint.fafmax.faf95_max + ? { + frequency: obj.joint.fafmax.faf95_max, + groupId: obj.joint.fafmax.faf95_max_gen_anc, + confidenceInterval: 0.95, + } + : undefined, + } + const subsets = [fullSet, ...(frequencies.ancestry_groups as Subset[])] + const cohortsWithoutSubcohorts = subsets.map((subset) => + resolveVACohortAlleleFrequency(focusAllele, obj.variant_id, subset) + ) + + return addSubcohorts(cohortsWithoutSubcohorts) +} diff --git a/graphql-api/src/graphql/resolvers/variant-fields.ts b/graphql-api/src/graphql/resolvers/variant-fields.ts index 48bcb5c94..8e5d30c6c 100644 --- a/graphql-api/src/graphql/resolvers/variant-fields.ts +++ b/graphql-api/src/graphql/resolvers/variant-fields.ts @@ -1,9 +1,15 @@ +import { resolveVACohortAlleleFrequencies, resolveVAAllele } from './va' + const resolvers = { Variant: { rsids: (obj: any) => obj.rsids || [], + va: resolveVACohortAlleleFrequencies, + vrs: resolveVAAllele, }, VariantDetails: { rsids: (obj: any) => obj.rsids || [], + va: resolveVACohortAlleleFrequencies, + vrs: resolveVAAllele, }, } export default resolvers diff --git a/graphql-api/src/graphql/resolvers/variants.ts b/graphql-api/src/graphql/resolvers/variants.ts index b5761f8d3..e875ecd9b 100644 --- a/graphql-api/src/graphql/resolvers/variants.ts +++ b/graphql-api/src/graphql/resolvers/variants.ts @@ -15,35 +15,49 @@ import { import { fetchNccConstraintRegionById } from '../../queries/genomic-constraint-queries' +import { hasVRSData } from '@gnomad/dataset-metadata/metadata' + const resolveVariant = async (_obj: any, args: any, ctx: any) => { - if (!(args.rsid || args.variantId)) { - throw new UserVisibleError('One of "rsid" or "variantId" is required') + // These are all "variant IDs" of one kind or another but `variantId` here + // specifically refers to the chrom-pos-ref-alt style ubiquitous in gnomAD + const { rsid, variantId, vrsId, dataset } = args + + if (!dataset) { + throw new UserVisibleError('Dataset is required') } - if (args.rsid && args.variantId) { - throw new UserVisibleError('Only one of "rsid" or "variantId" is allowed') + + const nSpecifiedIds = [rsid, variantId, vrsId].filter((id) => id).length + if (nSpecifiedIds !== 1) { + throw new UserVisibleError('Exactly one of "rsid", "variantId", or "vrsId" is required') } - let variantId - if (args.variantId) { - if (!isVariantId(args.variantId)) { + let normalizedVariantId + + if (variantId) { + if (!isVariantId(variantId)) { throw new UserVisibleError('Invalid variant ID') } - variantId = normalizeVariantId(args.variantId) - } else { - if (!isRsId(args.rsid)) { + normalizedVariantId = normalizeVariantId(variantId) + } + + if (rsid) { + if (!isRsId(rsid)) { throw new UserVisibleError('Invalid rsID') } - variantId = args.rsid.toLowerCase() + normalizedVariantId = args.rsid.toLowerCase() } - const { dataset } = args - if (!dataset) { - throw new UserVisibleError('Dataset is required') + if (vrsId) { + if (!hasVRSData(dataset)) { + throw new UserVisibleError(`Dataset ${dataset} does not have VRS data`) + } + + normalizedVariantId = /^ga4gh:/.test(vrsId) ? vrsId : `ga4gh:${vrsId}` } - const variant = await fetchVariantById(ctx.esClient, dataset, variantId) + const variant = await fetchVariantById(ctx.esClient, dataset, normalizedVariantId) const posRounded = Math.floor(variant.pos / 1000) * 1000 const variantNCCId = `chr${variant.chrom}-${posRounded}-${posRounded + 1000}` const variantNCC = await fetchNccConstraintRegionById(ctx.esClient, variantNCCId) diff --git a/graphql-api/src/graphql/types/query.graphql b/graphql-api/src/graphql/types/query.graphql index 5e84f9da5..2ed7544a8 100644 --- a/graphql-api/src/graphql/types/query.graphql +++ b/graphql-api/src/graphql/types/query.graphql @@ -1,24 +1,42 @@ -type Query{ - gene(gene_id: String, gene_symbol: String, reference_genome: ReferenceGenomeId!): Gene @cost(value: 1) - region(chrom: String!, start: Int!, stop: Int!, reference_genome: ReferenceGenomeId!): Region! @cost(value: 1) - transcript(transcript_id: String!, reference_genome: ReferenceGenomeId!): Transcript @cost(value: 1) +type Query { + gene(gene_id: String, gene_symbol: String, reference_genome: ReferenceGenomeId!): Gene + @cost(value: 1) + region(chrom: String!, start: Int!, stop: Int!, reference_genome: ReferenceGenomeId!): Region! + @cost(value: 1) + transcript(transcript_id: String!, reference_genome: ReferenceGenomeId!): Transcript + @cost(value: 1) - clinvar_variant(variant_id: String!, reference_genome: ReferenceGenomeId!): ClinVarVariantDetails @cost(value: 1) - mitochondrial_variant(variant_id: String, dataset: DatasetId!): MitochondrialVariantDetails @cost(value: 1) - multiNucleotideVariant(variant_id: String!, dataset: DatasetId!): MultiNucleotideVariantDetails @cost(value: 1) - structural_variant(variantId: String!, dataset: StructuralVariantDatasetId!): StructuralVariantDetails @cost(value: 1) - variant(variantId: String, rsid: String, dataset: DatasetId!): VariantDetails @cost(value: 1) - copy_number_variant(variantId: String!, dataset: CopyNumberVariantDatasetId!): CopyNumberVariantDetails @cost(value: 1) - gene_search(query: String!, reference_genome: ReferenceGenomeId!): [GeneSearchResult!]! - variant_search(query: String!, dataset: DatasetId!): [VariantSearchResult!]! + clinvar_variant(variant_id: String!, reference_genome: ReferenceGenomeId!): ClinVarVariantDetails + @cost(value: 1) + mitochondrial_variant(variant_id: String, dataset: DatasetId!): MitochondrialVariantDetails + @cost(value: 1) + multiNucleotideVariant(variant_id: String!, dataset: DatasetId!): MultiNucleotideVariantDetails + @cost(value: 1) + structural_variant( + variantId: String! + dataset: StructuralVariantDatasetId! + ): StructuralVariantDetails @cost(value: 1) + variant(variantId: String, rsid: String, vrsId: String, dataset: DatasetId!): VariantDetails + @cost(value: 1) + copy_number_variant( + variantId: String! + dataset: CopyNumberVariantDatasetId! + ): CopyNumberVariantDetails @cost(value: 1) + gene_search(query: String!, reference_genome: ReferenceGenomeId!): [GeneSearchResult!]! + variant_search(query: String!, dataset: DatasetId!): [VariantSearchResult!]! - liftover(source_variant_id: String, liftover_variant_id: String, reference_genome: ReferenceGenomeId!): [LiftoverResult!]! + liftover( + source_variant_id: String + liftover_variant_id: String + reference_genome: ReferenceGenomeId! + ): [LiftoverResult!]! - variant_cooccurrence(variants: [String!]!, dataset: DatasetId!): VariantCooccurrence @cost(value: 5) + variant_cooccurrence(variants: [String!]!, dataset: DatasetId!): VariantCooccurrence + @cost(value: 5) - short_tandem_repeat(id: String!, dataset: DatasetId!): ShortTandemRepeatDetails @cost(value: 1) + short_tandem_repeat(id: String!, dataset: DatasetId!): ShortTandemRepeatDetails @cost(value: 1) - short_tandem_repeats(dataset: DatasetId!): [ShortTandemRepeat!]! @cost(value: 10) + short_tandem_repeats(dataset: DatasetId!): [ShortTandemRepeat!]! @cost(value: 10) - meta: BrowserMetadata! + meta: BrowserMetadata! } diff --git a/graphql-api/src/graphql/types/va.graphql b/graphql-api/src/graphql/types/va.graphql new file mode 100644 index 000000000..7a00ecfcb --- /dev/null +++ b/graphql-api/src/graphql/types/va.graphql @@ -0,0 +1,116 @@ +type VANumber { + type: String! + value: Int! +} + +type VADefiniteRange { + type: String! + min: Float! + max: Float! +} + +enum VAComparator { + LTE + GTE +} + +type VAIndefiniteRange { + type: String! + value: Float! + comparator: VAComparator! +} + +union VANumberlike = VADefiniteRange | VAIndefiniteRange | VANumber + +type VASequenceInterval { + type: String! + start: VANumber! + end: VANumber! +} + +type VACytobandInterval { + type: String! + start: String! + end: String! +} + +type VALiteralSequenceExpression { + type: String! + sequence: String! +} + +type VASequenceLocation { + _id: String + type: String! + sequence_id: String! + interval: VASequenceInterval! +} + +type VAAllele { + _id: String + type: String! + location: VASequenceLocation! + state: VALiteralSequenceExpression! +} + +type VACohortAlleleFrequencyDerivation { + id: String + type: String + label: String + version: String +} + +type VACohortCharacteristic { + name: String! + value: String! +} + +type VACohort { + id: String! + label: String + characteristics: [VACohortCharacteristic!] +} + +type VAGrpMaxFAF95 { + frequency: Float! + confidenceInterval: Float! + groupId: String! +} + +type VAAncillaryResults { + grpMaxFAF95: VAGrpMaxFAF95 + jointGrpMaxFAF95: VAGrpMaxFAF95 + homozygotes: Int + hemizygotes: Int +} + +type VAQualityMeasures { + meanDepth: Float + fractionCoverage20x: Float + qcFilters: [String!] + monoallelic: Boolean + lowComplexityRegion: Boolean + lowConfidenceLossOfFunctionError: Boolean + lossOfFunctionWarning: Boolean + noncodingTranscriptError: Boolean + heterozygousSkewedAlleleCount: Int +} + +type VACohortAlleleFrequency { + id: String! + type: String! + label: String + derivedFrom: VACohortAlleleFrequencyDerivation + focusAllele: VAAllele! + focusAlleleCount: Int! + locusAlleleCount: Int! + alleleFrequency: Float! + cohort: VACohort! + ancillaryResults: VAAncillaryResults + subcohortFrequency: [VACohortAlleleFrequency] +} + +type VA { + va: [VACohortAlleleFrequency!] + vrs: [VAAllele!] +} diff --git a/graphql-api/src/graphql/types/variant.graphql b/graphql-api/src/graphql/types/variant.graphql index e88549750..850d23cd0 100644 --- a/graphql-api/src/graphql/types/variant.graphql +++ b/graphql-api/src/graphql/types/variant.graphql @@ -189,6 +189,10 @@ type Variant { # Deprecated - use hgvsp and hgvsc instead hgvs: String + + # GA4GH-format data + va: [VACohortAlleleFrequency!] + vrs: VAAllele } type Coverage { @@ -315,6 +319,9 @@ type VariantDetails { colocatedVariants: [String!]! multiNucleotideVariants: [MultiNucleotideVariantSummary!] sortedTranscriptConsequences: [TranscriptConsequence!] + + va: [VACohortAlleleFrequency!] + vrs: VAAllele } type VariantSearchResult { diff --git a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts index 627fd402b..60bd0b73d 100644 --- a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts +++ b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts @@ -51,14 +51,27 @@ const countVariantsInRegion = async (esClient: any, region: any, _subset: Subset // Variant query // ================================================================================================ -const fetchVariantById = async (esClient: any, variantIdOrRsid: any, subset: Subset) => { - const idField = isRsId(variantIdOrRsid) ? 'rsids' : 'variant_id' +const isVrsId = (id: string) => /^ga4gh:/.test(id) + +const chooseIdField = (variantId: string) => { + if (isRsId(variantId)) { + return 'rsids' + } + + if (isVrsId(variantId)) { + return 'allele_id' + } + return 'variant_id' +} + +const fetchVariantById = async (esClient: any, variantId: any, subset: Subset) => { + const idField = chooseIdField(variantId) const response = await esClient.search({ index: GNOMAD_V4_VARIANT_INDEX, body: { query: { bool: { - filter: { term: { [idField]: variantIdOrRsid } }, + filter: { term: { [idField]: variantId } }, }, }, },