From 83cb92ce5759b030f2136b681842b7cbe2170304 Mon Sep 17 00:00:00 2001
From: Phil Darnowsky <pdarnows@broadinstitute.org>
Date: Mon, 17 Jun 2024 14:52:26 -0400
Subject: [PATCH] Add VRS data to GraphQL API

---
 dataset-metadata/metadata.ts                  |  22 +
 graphql-api/src/graphql/resolvers/va.spec.ts  | 175 ++++++++
 graphql-api/src/graphql/resolvers/va.ts       | 396 ++++++++++++++++++
 .../src/graphql/resolvers/variant-fields.ts   |   6 +
 graphql-api/src/graphql/resolvers/variants.ts |  44 +-
 graphql-api/src/graphql/types/query.graphql   |  52 ++-
 graphql-api/src/graphql/types/va.graphql      | 116 +++++
 graphql-api/src/graphql/types/variant.graphql |   7 +
 .../gnomad-v4-variant-queries.ts              |  19 +-
 9 files changed, 802 insertions(+), 35 deletions(-)
 create mode 100644 graphql-api/src/graphql/resolvers/va.spec.ts
 create mode 100644 graphql-api/src/graphql/resolvers/va.ts
 create mode 100644 graphql-api/src/graphql/types/va.graphql

diff --git a/dataset-metadata/metadata.ts b/dataset-metadata/metadata.ts
index ff6fe624d..0b6ed9262 100644
--- a/dataset-metadata/metadata.ts
+++ b/dataset-metadata/metadata.ts
@@ -73,6 +73,7 @@ export type DatasetMetadata = {
   structuralVariantDatasetId: DatasetId
   copyNumberVariantDatasetId: DatasetId
   hasJointFrequencyData: boolean
+  hasVRSData: boolean
 }
 
 const metadata: Record<DatasetId, DatasetMetadata> = {
@@ -124,6 +125,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r2_1: {
     isSubset: false,
@@ -173,6 +175,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r2_1_controls: {
     isSubset: true,
@@ -222,6 +225,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r2_1_non_cancer: {
     isSubset: true,
@@ -271,6 +275,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r2_1_non_neuro: {
     isSubset: true,
@@ -320,6 +325,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r2_1_non_topmed: {
     isSubset: true,
@@ -369,6 +375,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r3: {
     isSubset: false,
@@ -418,6 +425,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r3_controls_and_biobanks: {
     isSubset: true,
@@ -467,6 +475,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r3_non_cancer: {
     isSubset: true,
@@ -516,6 +525,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r3_non_neuro: {
     isSubset: true,
@@ -565,6 +575,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r3_non_topmed: {
     isSubset: true,
@@ -614,6 +625,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_r3_non_v2: {
     isSubset: true,
@@ -663,6 +675,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_sv_r2_1: {
     isSubset: false,
@@ -712,6 +725,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_sv_r2_1_controls: {
     isSubset: true,
@@ -761,6 +775,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_sv_r2_1_non_neuro: {
     isSubset: true,
@@ -810,6 +825,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: false,
+    hasVRSData: false,
   },
   gnomad_sv_r4: {
     isSubset: false,
@@ -859,6 +875,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: true,
+    hasVRSData: false,
   },
   gnomad_cnv_r4: {
     isSubset: false,
@@ -908,6 +925,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: true,
     hasJointFrequencyData: true,
+    hasVRSData: false,
   },
   gnomad_r4: {
     isSubset: false,
@@ -957,6 +975,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     copyNumberVariantDatasetId: 'gnomad_cnv_r4',
     hasCopyNumberVariantCoverage: false,
     hasJointFrequencyData: true,
+    hasVRSData: true,
   },
   gnomad_r4_non_ukb: {
     isSubset: true,
@@ -1006,6 +1025,7 @@ const metadata: Record<DatasetId, DatasetMetadata> = {
     hasCopyNumberVariantCoverage: false,
     hasRelatedVariants: true,
     hasJointFrequencyData: false,
+    hasVRSData: true,
   },
 }
 
@@ -1156,3 +1176,5 @@ export const getTopLevelDataset = (datasetId: DatasetId) => {
 
   return 'default'
 }
+// Looks up the `hasVRSData` flag of the given dataset's metadata entry via getMetadata
+export const hasVRSData = (datasetId: DatasetId) => getMetadata(datasetId, 'hasVRSData')
diff --git a/graphql-api/src/graphql/resolvers/va.spec.ts b/graphql-api/src/graphql/resolvers/va.spec.ts
new file mode 100644
index 000000000..410b3d5bc
--- /dev/null
+++ b/graphql-api/src/graphql/resolvers/va.spec.ts
@@ -0,0 +1,175 @@
+import { describe, expect, test } from '@jest/globals'
+
+import {
+  resolveVACohortAlleleFrequencies,
+  resolveVAAllele,
+  Allele as VAAllele,
+  CohortAlleleFrequency,
+} from './va'
+
+const alleleEsDocument = {
+  vrs: {
+    ref: {
+      allele_id: 'ga4gh:SQ.IAmTheRefStateID', // not read by resolveVAAllele
+      start: 123, // not read by resolveVAAllele
+      end: 234, // not read by resolveVAAllele
+      state: 'G', // hashed by the resolver to produce location.sequence_id
+    },
+    alt: {
+      allele_id: 'ga4gh:SQ.IAmTheAltStateID', // becomes the Allele's _id
+      start: 124, // becomes location.interval.start.value
+      end: 235, // becomes location.interval.end.value
+      state: 'A', // becomes state.sequence
+    },
+  },
+}
+
+const expectedAllele: VAAllele = {
+  _id: 'ga4gh:SQ.IAmTheAltStateID', // copied from vrs.alt.allele_id
+  type: 'Allele',
+  location: {
+    _id: 'ga4gh:VSL.OogSNIt-1Z7HF4tbdm45IDLYc7-oSE2Y', // expected digest of the fields below
+    type: 'SequenceLocation',
+    sequence_id: '2mN7PzLXx-QQq2GVIODPRSkWmlwybsv0', // expected digest of the ref state 'G'
+    interval: {
+      type: 'SequenceInterval',
+      start: { type: 'Number', value: 124 },
+      end: { type: 'Number', value: 235 },
+    },
+  },
+  state: {
+    type: 'LiteralSequenceExpression',
+    sequence: 'A',
+  },
+}
+
+describe('resolveVAAllele', () => {
+  test('parses a single allele correctly', async () => {
+    // resolveVAAllele ignores its args and context parameters, hence the nulls
+    await expect(resolveVAAllele(alleleEsDocument, null, null)).resolves.toEqual(expectedAllele)
+  })
+})
+
+describe('resolveVACohortAlleleFrequency', () => {
+  const exomeEsDocument = {
+    ac: 5,
+    an: 100,
+    hemizygote_count: 2,
+    homozygote_count: 3,
+    faf95: { popmax: 0.123, popmax_population: 'afr' }, // source of the expected grpMaxFAF95
+    ancestry_groups: [], // no subsets, so only the overall cohort is produced
+  }
+
+  const variantESDocument = {
+    ...alleleEsDocument,
+    variant_id: '1-123-G-A',
+    exome: exomeEsDocument,
+    joint: { fafmax: { faf95_max: 0.234, faf95_max_gen_anc: 'amr' } }, // -> jointGrpMaxFAF95
+  }
+
+  test('parses a single CohortAlleleFrequency correctly', async () => {
+    const resolved = await resolveVACohortAlleleFrequencies(variantESDocument, null, null)
+    const expected: CohortAlleleFrequency[] = [
+      {
+        id: 'gnomad4:1-123-G-A', // the overall cohort gets no `.<subset id>` suffix
+        label: 'Overall Cohort Allele Frequency for 1-123-G-A',
+        type: 'CohortAlleleFrequency',
+        focusAllele: expectedAllele,
+        derivedFrom: {
+          id: 'gnomad4.1.0',
+          type: 'DataSet',
+          label: 'gnomAD v4.1.0',
+          version: '4.1.0',
+        },
+        focusAlleleCount: 5, // exome.ac
+        locusAlleleCount: 100, // exome.an
+        alleleFrequency: 0.05, // ac / an
+        cohort: { id: 'ALL', label: 'Overall', characteristics: null },
+        ancillaryResults: {
+          grpMaxFAF95: { frequency: 0.123, confidenceInterval: 0.95, groupId: 'afr' },
+          jointGrpMaxFAF95: { frequency: 0.234, confidenceInterval: 0.95, groupId: 'amr' },
+          homozygotes: 3, // exome.homozygote_count
+          hemizygotes: 2, // exome.hemizygote_count
+        },
+        subcohortFrequency: [], // ancestry_groups is empty in the fixture
+      },
+    ]
+
+    expect(resolved).toEqual(expected)
+  })
+
+  test('has the correct subcohortAlleleFrequency when there are multiple CAFs', async () => {
+    // Shuffled order of IDs is intentional here to better test sorting
+    const subcohortIds = [
+      'eur_XY',
+      'XY',
+      'ami_XX',
+      'amr',
+      'XX',
+      'ami',
+      'ami_XY',
+      'amr_XX',
+      'eur',
+      'amr_XY',
+      'eur_XX',
+    ]
+    const subcohortDocuments = subcohortIds.map((subcohortId) => ({
+      ...exomeEsDocument,
+      id: subcohortId,
+    }))
+
+    const fullDocument = {
+      ...variantESDocument,
+      exome: { ...exomeEsDocument, ancestry_groups: subcohortDocuments },
+    }
+
+    const resolved = await resolveVACohortAlleleFrequencies(fullDocument, null, null)
+    // One entry per subset plus the overall cohort; a mismatch reports the actual length
+    expect(resolved).toHaveLength(subcohortIds.length + 1)
+
+    // Map each resolved cohort id to the ids listed in its subcohortFrequency, so the
+    // nesting can be asserted without comparing whole frequency objects
+    const subcohortMap: Record<string, string[]> = {}
+    resolved!.forEach((cohort) => {
+      subcohortMap[cohort.id] = cohort.subcohortFrequency.map((subcohort) => subcohort.id)
+    })
+
+    expect(subcohortMap['gnomad4:1-123-G-A']!.sort()).toEqual(
+      subcohortIds.map((cohortId) => `gnomad4:1-123-G-A.${cohortId}`).sort()
+    )
+
+    expect(subcohortMap['gnomad4:1-123-G-A.XX'].sort()).toEqual([
+      'gnomad4:1-123-G-A.ami_XX',
+      'gnomad4:1-123-G-A.amr_XX',
+      'gnomad4:1-123-G-A.eur_XX',
+    ])
+
+    expect(subcohortMap['gnomad4:1-123-G-A.XY'].sort()).toEqual([
+      'gnomad4:1-123-G-A.ami_XY',
+      'gnomad4:1-123-G-A.amr_XY',
+      'gnomad4:1-123-G-A.eur_XY',
+    ])
+
+    expect(subcohortMap['gnomad4:1-123-G-A.ami'].sort()).toEqual([
+      'gnomad4:1-123-G-A.ami_XX',
+      'gnomad4:1-123-G-A.ami_XY',
+    ])
+
+    expect(subcohortMap['gnomad4:1-123-G-A.amr'].sort()).toEqual([
+      'gnomad4:1-123-G-A.amr_XX',
+      'gnomad4:1-123-G-A.amr_XY',
+    ])
+
+    expect(subcohortMap['gnomad4:1-123-G-A.eur'].sort()).toEqual([
+      'gnomad4:1-123-G-A.eur_XX',
+      'gnomad4:1-123-G-A.eur_XY',
+    ])
+
+    expect(subcohortMap['gnomad4:1-123-G-A.ami_XX']).toEqual([])
+    expect(subcohortMap['gnomad4:1-123-G-A.ami_XY']).toEqual([])
+    expect(subcohortMap['gnomad4:1-123-G-A.amr_XX']).toEqual([])
+    expect(subcohortMap['gnomad4:1-123-G-A.amr_XY']).toEqual([])
+    expect(subcohortMap['gnomad4:1-123-G-A.eur_XX']).toEqual([])
+    expect(subcohortMap['gnomad4:1-123-G-A.eur_XY']).toEqual([])
+  })
+})
diff --git a/graphql-api/src/graphql/resolvers/va.ts b/graphql-api/src/graphql/resolvers/va.ts
new file mode 100644
index 000000000..d8653e557
--- /dev/null
+++ b/graphql-api/src/graphql/resolvers/va.ts
@@ -0,0 +1,396 @@
+import { createHash } from 'crypto'
+// Display names for genetic ancestry group ids, used in cohort labels and characteristics
+const POPULATION_NAMES: Record<string, string> = {
+  afr: 'African/African-American',
+  ami: 'Amish',
+  amr: 'Admixed American',
+  asj: 'Ashkenazi Jewish',
+  eas: 'East Asian',
+  eur: 'European',
+  fin: 'Finnish',
+  mid: 'Middle Eastern',
+  nfe: 'Non-Finnish European',
+  oth: 'Other',
+  remaining: 'Remaining individuals',
+  sas: 'South Asian',
+  uniform: 'Uniform',
+  sas_non_consang: 'South Asian (F < 0.05)',
+  consanguineous: 'South Asian (F > 0.05)',
+  exac: 'ExAC',
+  bgr: 'Bulgarian (Eastern European)',
+  est: 'Estonian',
+  gbr: 'British',
+  nwe: 'North-Western European',
+  seu: 'Southern European',
+  swe: 'Swedish',
+  kor: 'Korean',
+  sgp: 'Singaporean',
+  jpn: 'Japanese',
+  oea: 'Other East Asian',
+  oeu: 'Other European',
+  onf: 'Other Non-Finnish European',
+  unk: 'Unknown',
+}
+
+// Named "VANumber" because "Number" is already the name of the built-in global
+type VANumber = { type: string; value: number }
+
+type SequenceInterval = {
+  type: string
+  start: VANumber
+  end: VANumber
+}
+
+type SequenceLocation = {
+  _id: string | null // set from generateLocationId in resolveVAAllele
+  type: string
+  sequence_id: string // set from generateSequenceId in resolveVAAllele
+  interval: SequenceInterval
+}
+
+type UnhashedSequenceLocation = Omit<SequenceLocation, '_id'> // input to generateLocationId
+
+type LiteralSequenceExpression = {
+  type: string
+  sequence: string
+}
+
+export type Allele = {
+  _id: string | null // resolveVAAllele copies this from `vrs.alt.allele_id`
+  type: string
+  location: SequenceLocation
+  state: LiteralSequenceExpression
+}
+
+type CohortCharacteristic = {
+  name: string // 'biological sex' or 'genetic ancestry' where built in cohortForSubset
+  value: string
+}
+
+type Cohort = {
+  id: string // 'ALL' for the overall cohort, otherwise the subset id
+  label: string | null
+  characteristics: CohortCharacteristic[] | null // null only for the overall cohort
+}
+
+type CohortAlleleFrequencyDerivation = {
+  id: string | null
+  type: string | null
+  label: string | null
+  version: string | null
+}
+
+type GrpMaxFAF95 = {
+  frequency: number
+  confidenceInterval: number // 0.95 wherever this file constructs the value
+  groupId: string
+}
+
+type AncillaryResults = {
+  grpMaxFAF95: GrpMaxFAF95 | null
+  jointGrpMaxFAF95: GrpMaxFAF95 | null
+  homozygotes: number | null
+  hemizygotes: number | null
+}
+
+export type CohortAlleleFrequency = {
+  id: string // `gnomad4:<variant_id>`, plus `.<subset id>` for non-overall cohorts
+  type: string
+  label: string | null
+  derivedFrom: CohortAlleleFrequencyDerivation | null
+  focusAllele: Allele
+  focusAlleleCount: number // the subset's `ac`
+  locusAlleleCount: number // the subset's `an`
+  alleleFrequency: number // ac / an
+  cohort: Cohort
+  ancillaryResults: AncillaryResults | null
+  subcohortFrequency: CohortAlleleFrequency[] // filled in by addSubcohorts
+}
+
+type CohortAlleleFrequencyWithoutSubcohorts = Omit<CohortAlleleFrequency, 'subcohortFrequency'>
+
+const hashWithSha512t24u = (s: string): string => {
+  // GA4GH "sha512t24u" digest: SHA-512, truncated to its first 24 bytes, base64url-encoded.
+  // subarray() is a zero-copy view and avoids Buffer.copyBytesFrom, which older Node lacks.
+  return createHash('sha512').update(s).digest().subarray(0, 24).toString('base64url')
+}
+
+type JSONAble = string | number | Record<string, any>
+
+// Canonical JSON used for digests: keys sorted, `_`-prefixed keys dropped, no whitespace.
+const normalizedStringify = (input: JSONAble): string => {
+  if (input === null || typeof input !== 'object') {
+    return JSON.stringify(input) // primitives and null use plain JSON encoding
+  }
+  if (Array.isArray(input)) {
+    return `[${input.map((item) => normalizedStringify(item)).join(',')}]` // order is kept
+  }
+  const serializedPairs = Object.keys(input)
+    .filter((key) => !key.startsWith('_'))
+    .sort()
+    .map((key) => `${JSON.stringify(key)}:${normalizedStringify(input[key])}`)
+  return `{${serializedPairs.join(',')}}`
+}
+
+// Identifier for a location: `ga4gh:VSL.` followed by the digest of its normalized JSON form
+const generateLocationId = (location: UnhashedSequenceLocation) => {
+  const canonicalJSON = normalizedStringify(location)
+  return 'ga4gh:VSL.' + hashWithSha512t24u(canonicalJSON)
+}
+
+// Bare sha512t24u digest of the given sequence string; unlike generateLocationId, no
+// `ga4gh:` style prefix is added to the result
+const generateSequenceId = (sequence: string) => hashWithSha512t24u(sequence)
+
+/**
+ * Build the VRS Allele for a variant document from its `vrs.ref` and `vrs.alt` fields.
+ * Resolves to null when the document has no VRS data, or only part of it.
+ */
+export const resolveVAAllele = async (obj: any, _args: any, _ctx: any): Promise<Allele | null> => {
+  const { ref, alt } = obj.vrs || {}
+  if (!ref || !alt) {
+    return null
+  }
+
+  // NOTE(review): sequence_id is the digest of the ref allele's `state`, not of a whole
+  // reference sequence -- confirm this is the identifier downstream consumers expect.
+  const unhashedLocation: UnhashedSequenceLocation = {
+    type: 'SequenceLocation',
+    sequence_id: generateSequenceId(ref.state as string),
+    interval: {
+      type: 'SequenceInterval',
+      start: { type: 'Number', value: alt.start },
+      end: { type: 'Number', value: alt.end },
+    },
+  }
+  const location: SequenceLocation = {
+    ...unhashedLocation,
+    // `_id` is derived from the other location fields, so it is attached last
+    _id: generateLocationId(unhashedLocation),
+  }
+
+  return {
+    _id: alt.allele_id as string,
+    type: 'Allele',
+    location,
+    state: { type: 'LiteralSequenceExpression', sequence: alt.state as string },
+  }
+}
+
+type Subset = {
+  id?: string // absent for the overall (full) set
+  ac: number
+  an: number
+  hemizygote_count: number
+  homozygote_count: number
+  grpMax?: GrpMaxFAF95 // populated for the overall set in resolveVACohortAlleleFrequencies
+  jointGrpMax?: GrpMaxFAF95 // populated for the overall set when joint fafmax data exists
+}
+
+const GNOMAD_V4_DERIVATION = { // `derivedFrom` value attached to every CohortAlleleFrequency
+  id: 'gnomad4.1.0',
+  type: 'DataSet',
+  label: 'gnomAD v4.1.0',
+  version: '4.1.0',
+}
+
+const getAncestryAndSexIds = (subsetId: string): [string | undefined, string | undefined] => {
+  const parts = subsetId.split('_') // e.g. "XX" -> sex only, "amr_XY" -> ancestry and sex
+  return parts[0] === 'XX' || parts[0] === 'XY' ? [undefined, parts[0]] : [parts[0], parts[1]]
+}
+
+// Human-readable name of a cohort: overall, sex-only, or ancestry group (optionally by sex)
+const cohortDescription = (subsetId: string | undefined): string => {
+  // Empty ids count as the overall cohort, matching cohortForSubset's `!subset.id` check
+  if (!subsetId) {
+    return 'Overall Cohort'
+  }
+  const [ancestryGroupId, sexId] = getAncestryAndSexIds(subsetId)
+  if (!ancestryGroupId) {
+    return sexId || subsetId // sex-only cohorts are labelled by the bare sex id
+  }
+
+  // Fall back to the raw id so an unmapped group never renders as "undefined"
+  const ancestryGroupName = POPULATION_NAMES[ancestryGroupId] || ancestryGroupId
+  const sexPart = sexId ? ` ${sexId}` : ''
+  return `${ancestryGroupName}${sexPart} Ancestry Group`
+}
+
+// Describe the cohort that a frequency subset covers. A subset without an id is the overall
+// cohort; otherwise the id is split into its ancestry group and/or sex parts.
+const cohortForSubset = (subset: Subset): Cohort => {
+  const { id } = subset
+
+  if (!id) {
+    return { id: 'ALL', label: 'Overall', characteristics: null }
+  }
+
+  const [ancestryGroupId, sexId] = getAncestryAndSexIds(id)
+  const ancestryGroupName = ancestryGroupId ? POPULATION_NAMES[ancestryGroupId] : undefined
+
+  // Sex is listed before ancestry; ancestry is included only when the group has a known name
+  const characteristics: CohortCharacteristic[] = []
+  if (sexId) {
+    characteristics.push({ name: 'biological sex', value: sexId })
+  }
+  if (ancestryGroupName) {
+    characteristics.push({ name: 'genetic ancestry', value: ancestryGroupName })
+  }
+
+  return { id, label: cohortDescription(id), characteristics }
+}
+
+// Build the CohortAlleleFrequency (without subcohorts) for one frequency subset of a variant.
+// The overall subset has no id, so its CAF id is just `gnomad4:<variant_id>`.
+const resolveVACohortAlleleFrequency = (
+  focusAllele: Allele,
+  variant_id: string,
+  subset: Subset
+): CohortAlleleFrequencyWithoutSubcohorts => {
+  const { ac, an, homozygote_count, hemizygote_count } = subset
+  const idSuffix = subset.id ? `.${subset.id}` : ''
+  const cohort = cohortForSubset(subset)
+
+  return {
+    id: `gnomad4:${variant_id}${idSuffix}`,
+    label: `${cohortDescription(subset.id)} Allele Frequency for ${variant_id}`,
+    type: 'CohortAlleleFrequency',
+    focusAllele,
+    derivedFrom: GNOMAD_V4_DERIVATION,
+    focusAlleleCount: ac,
+    locusAlleleCount: an,
+    // With AN = 0 the division yields NaN (or Infinity), which the schema's non-null Float
+    // cannot represent; report 0 instead.
+    alleleFrequency: an ? ac / an : 0,
+    cohort,
+    ancillaryResults: {
+      grpMaxFAF95: subset.grpMax || null,
+      jointGrpMaxFAF95: subset.jointGrpMax || null,
+      homozygotes: homozygote_count !== undefined ? homozygote_count : null,
+      hemizygotes: hemizygote_count !== undefined ? hemizygote_count : null,
+    },
+  }
+}
+
+// Ids of the cohorts nested under `cohortId`. A sex-only cohort ("<variant>.XX" / ".XY") owns
+// every other id ending in that sex; any other cohort owns every other id it is a prefix of.
+const findSubcohortIds = (cohortId: string, possibleSubcohortIds: string[]): string[] => {
+  const sexSuffix = cohortId.split('.')[1] || ''
+  const isSexOnly = sexSuffix === 'XX' || sexSuffix === 'XY'
+  return possibleSubcohortIds.filter(
+    (candidateId) =>
+      candidateId !== cohortId &&
+      (isSexOnly ? candidateId.endsWith(sexSuffix) : candidateId.startsWith(cohortId))
+  )
+}
+
+/* Quick refresher to save you checking Wikipedia at this point: topo sort
+ * takes a list of items that can have dependencies from one item to another,
+ * and returns them in an order such that, if A depends on B, B is guaranteed
+ * to appear before A in the output. The classic example is what package
+ * managers (npm, apt, rubygems, etc.) do: you want to install package A,
+ * which depends on B and C, and B in turn depends on D and E. The package
+ * manager will make sure that D and E are installed before it tries to
+ * install B, that B and C are installed before A, and so on. However, the
+ * order between B and C isn't guaranteed either way, since neither of them
+ * depends on the other, either directly or indirectly.
+ *
+ * In this particular case, we're ordering subcohorts, and we say A depends on
+ * B if B is a subset of A. The reason we use this definition is that our
+ * ultimate goal is to fill in the subcohortFrequency field for each
+ * CohortAlleleFrequency. subcohortFrequency contains a list of
+ * CohortAlleleFrequency, each of which will also contain its own (possibly
+ * empty) list of its own subcohorts in its subcohortFrequency field, so
+ * to compute the subcohortFrequency for A, first we must fill in B's, and so
+ * on recursively.
+ */
+
+const topologicalSortLoop = (
+  subcohortMap: Record<string, string[]>,
+  remaining: string[],
+  sorted: string[]
+): string[] => {
+  if (remaining.length === 0) {
+    return sorted
+  }
+
+  // A cohort becomes eligible once every one of its subcohorts has already been emitted
+  const next = remaining.find((id) => subcohortMap[id].every((sub) => sorted.includes(sub)))
+  // `find` yields undefined for a cyclic map; recursing on it would never shrink `remaining`
+  if (next === undefined) {
+    throw new Error(`Cannot order cohorts with unresolved subcohorts: ${remaining.join(', ')}`)
+  }
+  const newRemaining = remaining.filter((cohortId) => cohortId !== next)
+  return topologicalSortLoop(subcohortMap, newRemaining, [...sorted, next])
+}
+
+// Build the cohort -> subcohorts map for the given ids, then order the ids subcohorts-first
+const topologicalSort = (cohortIds: string[]): string[] => {
+  const subcohortMap: Record<string, string[]> = {}
+  cohortIds.forEach((cohortId) => {
+    subcohortMap[cohortId] = findSubcohortIds(cohortId, cohortIds)
+  })
+
+  return topologicalSortLoop(subcohortMap, Object.keys(subcohortMap), [])
+}
+
+// Attach to each cohort the completed frequencies of its subcohorts. Cohorts are processed
+// subcohorts-first, so every lookup in `completed` finds an entry that is already filled in;
+// the result lists the cohorts in the order they were inserted into `completed`.
+const addSubcohorts = (
+  cohortsWithoutSubcohorts: CohortAlleleFrequencyWithoutSubcohorts[]
+): CohortAlleleFrequency[] => {
+  const cohortIds = cohortsWithoutSubcohorts.map((cohort) => cohort.id)
+  const cohortsById: Record<string, CohortAlleleFrequencyWithoutSubcohorts> = {}
+  cohortsWithoutSubcohorts.forEach((cohort) => {
+    cohortsById[cohort.id] = cohort
+  })
+
+  const completed: Record<string, CohortAlleleFrequency> = {}
+  topologicalSort(cohortIds).forEach((cohortId) => {
+    const subcohortFrequency = findSubcohortIds(cohortId, cohortIds).map(
+      (subcohortId) => completed[subcohortId]
+    )
+    completed[cohortId] = { ...cohortsById[cohortId], subcohortFrequency }
+  })
+
+  return Object.values(completed)
+}
+
+// Resolve a variant's VA frequencies (overall + per ancestry group); null if data is missing
+export const resolveVACohortAlleleFrequencies = async (
+  obj: any,
+  args: any,
+  ctx: any
+): Promise<CohortAlleleFrequency[] | null> => {
+  const focusAllele = await resolveVAAllele(obj, args, ctx)
+  const frequencies = obj.exome || obj.genome // exome takes precedence when both exist
+  if (focusAllele === null || !frequencies) {
+    return null
+  }
+
+  const { faf95 } = frequencies
+  const fullSet: Subset = {
+    ac: frequencies.ac,
+    an: frequencies.an,
+    hemizygote_count: frequencies.hemizygote_count,
+    homozygote_count: frequencies.homozygote_count,
+    // Only report a group max when both fields the schema marks non-null are available
+    grpMax:
+      faf95 && typeof faf95.popmax === 'number' && faf95.popmax_population
+        ? { frequency: faf95.popmax, groupId: faf95.popmax_population, confidenceInterval: 0.95 }
+        : undefined,
+    jointGrpMax:
+      obj.joint && obj.joint.fafmax && obj.joint.fafmax.faf95_max
+        ? {
+            frequency: obj.joint.fafmax.faf95_max,
+            groupId: obj.joint.fafmax.faf95_max_gen_anc,
+            confidenceInterval: 0.95,
+          }
+        : undefined,
+  }
+  const subsets: Subset[] = [fullSet, ...(frequencies.ancestry_groups || [])]
+  return addSubcohorts(
+    subsets.map((subset) => resolveVACohortAlleleFrequency(focusAllele, obj.variant_id, subset))
+  )
+}
diff --git a/graphql-api/src/graphql/resolvers/variant-fields.ts b/graphql-api/src/graphql/resolvers/variant-fields.ts
index 48bcb5c94..8e5d30c6c 100644
--- a/graphql-api/src/graphql/resolvers/variant-fields.ts
+++ b/graphql-api/src/graphql/resolvers/variant-fields.ts
@@ -1,9 +1,15 @@
+import { resolveVACohortAlleleFrequencies, resolveVAAllele } from './va'
+
 const resolvers = {
   Variant: {
     rsids: (obj: any) => obj.rsids || [],
+    va: resolveVACohortAlleleFrequencies,
+    vrs: resolveVAAllele,
   },
   VariantDetails: {
     rsids: (obj: any) => obj.rsids || [],
+    va: resolveVACohortAlleleFrequencies,
+    vrs: resolveVAAllele,
   },
 }
 export default resolvers
diff --git a/graphql-api/src/graphql/resolvers/variants.ts b/graphql-api/src/graphql/resolvers/variants.ts
index b5761f8d3..e875ecd9b 100644
--- a/graphql-api/src/graphql/resolvers/variants.ts
+++ b/graphql-api/src/graphql/resolvers/variants.ts
@@ -15,35 +15,49 @@ import {
 
 import { fetchNccConstraintRegionById } from '../../queries/genomic-constraint-queries'
 
+import { hasVRSData } from '@gnomad/dataset-metadata/metadata'
+
 const resolveVariant = async (_obj: any, args: any, ctx: any) => {
-  if (!(args.rsid || args.variantId)) {
-    throw new UserVisibleError('One of "rsid" or "variantId" is required')
+  // These are all "variant IDs" of one kind or another but `variantId` here
+  // specifically refers to the chrom-pos-ref-alt style ubiquitous in gnomAD
+  const { rsid, variantId, vrsId, dataset } = args
+
+  if (!dataset) {
+    throw new UserVisibleError('Dataset is required')
   }
-  if (args.rsid && args.variantId) {
-    throw new UserVisibleError('Only one of "rsid" or "variantId" is allowed')
+
+  const nSpecifiedIds = [rsid, variantId, vrsId].filter((id) => id).length
+  if (nSpecifiedIds !== 1) {
+    throw new UserVisibleError('Exactly one of "rsid", "variantId", or "vrsId" is required')
   }
 
-  let variantId
-  if (args.variantId) {
-    if (!isVariantId(args.variantId)) {
+  let normalizedVariantId
+
+  if (variantId) {
+    if (!isVariantId(variantId)) {
       throw new UserVisibleError('Invalid variant ID')
     }
 
-    variantId = normalizeVariantId(args.variantId)
-  } else {
-    if (!isRsId(args.rsid)) {
+    normalizedVariantId = normalizeVariantId(variantId)
+  }
+
+  if (rsid) {
+    if (!isRsId(rsid)) {
       throw new UserVisibleError('Invalid rsID')
     }
 
-    variantId = args.rsid.toLowerCase()
+    normalizedVariantId = args.rsid.toLowerCase()
   }
 
-  const { dataset } = args
-  if (!dataset) {
-    throw new UserVisibleError('Dataset is required')
+  if (vrsId) {
+    if (!hasVRSData(dataset)) {
+      throw new UserVisibleError(`Dataset ${dataset} does not have VRS data`)
+    }
+
+    normalizedVariantId = /^ga4gh:/.test(vrsId) ? vrsId : `ga4gh:${vrsId}`
   }
 
-  const variant = await fetchVariantById(ctx.esClient, dataset, variantId)
+  const variant = await fetchVariantById(ctx.esClient, dataset, normalizedVariantId)
   const posRounded = Math.floor(variant.pos / 1000) * 1000
   const variantNCCId = `chr${variant.chrom}-${posRounded}-${posRounded + 1000}`
   const variantNCC = await fetchNccConstraintRegionById(ctx.esClient, variantNCCId)
diff --git a/graphql-api/src/graphql/types/query.graphql b/graphql-api/src/graphql/types/query.graphql
index 5e84f9da5..2ed7544a8 100644
--- a/graphql-api/src/graphql/types/query.graphql
+++ b/graphql-api/src/graphql/types/query.graphql
@@ -1,24 +1,42 @@
-type Query{
-    gene(gene_id: String, gene_symbol: String, reference_genome: ReferenceGenomeId!): Gene @cost(value: 1)
-    region(chrom: String!, start: Int!, stop: Int!, reference_genome: ReferenceGenomeId!): Region! @cost(value: 1)
-    transcript(transcript_id: String!, reference_genome: ReferenceGenomeId!): Transcript @cost(value: 1)
+type Query {
+  gene(gene_id: String, gene_symbol: String, reference_genome: ReferenceGenomeId!): Gene
+    @cost(value: 1)
+  region(chrom: String!, start: Int!, stop: Int!, reference_genome: ReferenceGenomeId!): Region!
+    @cost(value: 1)
+  transcript(transcript_id: String!, reference_genome: ReferenceGenomeId!): Transcript
+    @cost(value: 1)
 
-    clinvar_variant(variant_id: String!, reference_genome: ReferenceGenomeId!): ClinVarVariantDetails @cost(value: 1)
-    mitochondrial_variant(variant_id: String, dataset: DatasetId!): MitochondrialVariantDetails @cost(value: 1)
-    multiNucleotideVariant(variant_id: String!, dataset: DatasetId!): MultiNucleotideVariantDetails @cost(value: 1)
-    structural_variant(variantId: String!, dataset: StructuralVariantDatasetId!): StructuralVariantDetails @cost(value: 1)
-    variant(variantId: String, rsid: String, dataset: DatasetId!): VariantDetails @cost(value: 1)
-    copy_number_variant(variantId: String!, dataset: CopyNumberVariantDatasetId!): CopyNumberVariantDetails @cost(value: 1)  
-    gene_search(query: String!, reference_genome: ReferenceGenomeId!): [GeneSearchResult!]!
-    variant_search(query: String!, dataset: DatasetId!): [VariantSearchResult!]!
+  clinvar_variant(variant_id: String!, reference_genome: ReferenceGenomeId!): ClinVarVariantDetails
+    @cost(value: 1)
+  mitochondrial_variant(variant_id: String, dataset: DatasetId!): MitochondrialVariantDetails
+    @cost(value: 1)
+  multiNucleotideVariant(variant_id: String!, dataset: DatasetId!): MultiNucleotideVariantDetails
+    @cost(value: 1)
+  structural_variant(
+    variantId: String!
+    dataset: StructuralVariantDatasetId!
+  ): StructuralVariantDetails @cost(value: 1)
+  variant(variantId: String, rsid: String, vrsId: String, dataset: DatasetId!): VariantDetails
+    @cost(value: 1)
+  copy_number_variant(
+    variantId: String!
+    dataset: CopyNumberVariantDatasetId!
+  ): CopyNumberVariantDetails @cost(value: 1)
+  gene_search(query: String!, reference_genome: ReferenceGenomeId!): [GeneSearchResult!]!
+  variant_search(query: String!, dataset: DatasetId!): [VariantSearchResult!]!
 
-    liftover(source_variant_id: String, liftover_variant_id: String, reference_genome: ReferenceGenomeId!): [LiftoverResult!]!
+  liftover(
+    source_variant_id: String
+    liftover_variant_id: String
+    reference_genome: ReferenceGenomeId!
+  ): [LiftoverResult!]!
 
-    variant_cooccurrence(variants: [String!]!, dataset: DatasetId!): VariantCooccurrence @cost(value: 5)
+  variant_cooccurrence(variants: [String!]!, dataset: DatasetId!): VariantCooccurrence
+    @cost(value: 5)
 
-    short_tandem_repeat(id: String!, dataset: DatasetId!): ShortTandemRepeatDetails @cost(value: 1)
+  short_tandem_repeat(id: String!, dataset: DatasetId!): ShortTandemRepeatDetails @cost(value: 1)
 
-    short_tandem_repeats(dataset: DatasetId!): [ShortTandemRepeat!]! @cost(value: 10)
+  short_tandem_repeats(dataset: DatasetId!): [ShortTandemRepeat!]! @cost(value: 10)
 
-    meta: BrowserMetadata!
+  meta: BrowserMetadata!
 }
diff --git a/graphql-api/src/graphql/types/va.graphql b/graphql-api/src/graphql/types/va.graphql
new file mode 100644
index 000000000..7a00ecfcb
--- /dev/null
+++ b/graphql-api/src/graphql/types/va.graphql
@@ -0,0 +1,116 @@
+type VANumber { # integer wrapper; the resolvers in va.ts emit type "Number"
+  type: String!
+  value: Int!
+}
+
+type VADefiniteRange { # not referenced by any field in this patch
+  type: String!
+  min: Float!
+  max: Float!
+}
+
+enum VAComparator { # only used by VAIndefiniteRange
+  LTE
+  GTE
+}
+
+type VAIndefiniteRange { # not referenced by any field in this patch
+  type: String!
+  value: Float!
+  comparator: VAComparator!
+}
+
+union VANumberlike = VADefiniteRange | VAIndefiniteRange | VANumber # not referenced in this patch
+
+type VASequenceInterval { # start/end wrap the `vrs.alt` coordinates in resolveVAAllele
+  type: String!
+  start: VANumber!
+  end: VANumber!
+}
+
+type VACytobandInterval { # not referenced by any field in this patch
+  type: String!
+  start: String!
+  end: String!
+}
+
+type VALiteralSequenceExpression { # the resolver puts the alt allele's sequence here
+  type: String!
+  sequence: String!
+}
+
+type VASequenceLocation {
+  _id: String # computed `ga4gh:VSL.<digest>` identifier
+  type: String!
+  sequence_id: String!
+  interval: VASequenceInterval!
+}
+
+type VAAllele { # shape returned by the resolveVAAllele resolver
+  _id: String
+  type: String!
+  location: VASequenceLocation!
+  state: VALiteralSequenceExpression!
+}
+
+type VACohortAlleleFrequencyDerivation { # the dataset a frequency was derived from
+  id: String
+  type: String
+  label: String
+  version: String
+}
+
+type VACohortCharacteristic {
+  name: String!
+  value: String!
+}
+
+type VACohort {
+  id: String! # "ALL" for the overall cohort
+  label: String
+  characteristics: [VACohortCharacteristic!] # null for the overall cohort
+}
+
+type VAGrpMaxFAF95 {
+  frequency: Float!
+  confidenceInterval: Float!
+  groupId: String!
+}
+
+type VAAncillaryResults {
+  grpMaxFAF95: VAGrpMaxFAF95
+  jointGrpMaxFAF95: VAGrpMaxFAF95
+  homozygotes: Int
+  hemizygotes: Int
+}
+
+type VAQualityMeasures { # not referenced by any field in this patch
+  meanDepth: Float
+  fractionCoverage20x: Float
+  qcFilters: [String!]
+  monoallelic: Boolean
+  lowComplexityRegion: Boolean
+  lowConfidenceLossOfFunctionError: Boolean
+  lossOfFunctionWarning: Boolean
+  noncodingTranscriptError: Boolean
+  heterozygousSkewedAlleleCount: Int
+}
+
+type VACohortAlleleFrequency { # element type of the `va` field on Variant and VariantDetails
+  id: String!
+  type: String!
+  label: String
+  derivedFrom: VACohortAlleleFrequencyDerivation
+  focusAllele: VAAllele!
+  focusAlleleCount: Int!
+  locusAlleleCount: Int!
+  alleleFrequency: Float!
+  cohort: VACohort!
+  ancillaryResults: VAAncillaryResults
+  subcohortFrequency: [VACohortAlleleFrequency] # recursive: frequencies of nested cohorts
+}
+
+type VA { # not referenced by any field in this patch; Variant exposes `va` / `vrs` directly
+  va: [VACohortAlleleFrequency!]
+  vrs: [VAAllele!]
+}
diff --git a/graphql-api/src/graphql/types/variant.graphql b/graphql-api/src/graphql/types/variant.graphql
index e88549750..850d23cd0 100644
--- a/graphql-api/src/graphql/types/variant.graphql
+++ b/graphql-api/src/graphql/types/variant.graphql
@@ -189,6 +189,10 @@ type Variant {
 
   # Deprecated - use hgvsp and hgvsc instead
   hgvs: String
+
+  # GA4GH-format data
+  va: [VACohortAlleleFrequency!]
+  vrs: VAAllele
 }
 
 type Coverage {
@@ -315,6 +319,9 @@ type VariantDetails {
   colocatedVariants: [String!]!
   multiNucleotideVariants: [MultiNucleotideVariantSummary!]
   sortedTranscriptConsequences: [TranscriptConsequence!]
+
+  va: [VACohortAlleleFrequency!]
+  vrs: VAAllele
 }
 
 type VariantSearchResult {
diff --git a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts
index 627fd402b..60bd0b73d 100644
--- a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts
+++ b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts
@@ -51,14 +51,27 @@ const countVariantsInRegion = async (esClient: any, region: any, _subset: Subset
 // Variant query
 // ================================================================================================
 
-const fetchVariantById = async (esClient: any, variantIdOrRsid: any, subset: Subset) => {
-  const idField = isRsId(variantIdOrRsid) ? 'rsids' : 'variant_id'
+const isVrsId = (id: string) => /^ga4gh:/.test(id) // VRS ids are recognised by prefix alone
+
+// Choose the Elasticsearch field to filter on from the shape of the identifier
+const chooseIdField = (variantId: string) => {
+  if (isRsId(variantId)) {
+    return 'rsids'
+  }
+
+  // NOTE(review): the variant documents used in va.spec.ts keep this id at
+  // `vrs.alt.allele_id`; confirm the index also has a top-level `allele_id` field.
+  return isVrsId(variantId) ? 'allele_id' : 'variant_id'
+}
+
+const fetchVariantById = async (esClient: any, variantId: any, subset: Subset) => {
+  const idField = chooseIdField(variantId)
   const response = await esClient.search({
     index: GNOMAD_V4_VARIANT_INDEX,
     body: {
       query: {
         bool: {
-          filter: { term: { [idField]: variantIdOrRsid } },
+          filter: { term: { [idField]: variantId } },
         },
       },
     },