Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TASK-5385 Add mitochondrial data from Gnomad v3.1.2 #687

Merged
merged 3 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cellbase-app/app/scripts/gnomad/mitochondrial/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
gnomAD Mitochondrial DNA (mtDNA) variants v3.1:
URL: https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1/vcf/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz

Mapping file in ticket BIOINFO-99: mapping_file_gnomad_mt_mod_file.txt

Script to preprocess original VCF from gnomad: gnomad_mt.py

Script to load gnomad mt variants into OpenCGA and export them in json format annotation.populationFrequencies object: opencga_gnomad_mt.sh


120 changes: 120 additions & 0 deletions cellbase-app/app/scripts/gnomad/mitochondrial/gnomad_mt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import sys
import gzip


POPULATIONS = ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'nfe', 'oth', 'sas', 'mid']
HEADER_COMMON = [
'##INFO=<ID=AC,Number=1,Type=Integer,Description="Calculated allele count">',
'##INFO=<ID=AF,Number=1,Type=Float,Description="Calculated allele frequency">',
'##INFO=<ID=GTC,Number=1,Type=String,Description="Calculated list of genotype counts (0/0,0/1,1/1)">'
]
HEADER_POP = [
'##INFO=<ID=AF_{pop},Number=1,Type=Float,Description="Calculated allele frequency for {pop} population">',
'##INFO=<ID=AC_{pop},Number=1,Type=Integer,Description="Calculated allele count for {pop} population">',
'##INFO=<ID=AN_{pop},Number=1,Type=Integer,Description="Calculated allele number for {pop} population">',
'##INFO=<ID=GTC_{pop},Number=1,Type=String,Description="Calculated list of genotype counts for {pop} population (0/0,0/1,1/1)">'
]


def main():

# Creating custom header
custom_header = []
custom_header += HEADER_COMMON
for pop in POPULATIONS:
custom_header += ['\n'.join(HEADER_POP).format(pop=pop)]
custom_header = '\n'.join(custom_header) + '\n'

# Opening input/output files
vcf_input_fpath = sys.argv[1]
vcf_output_fpath = sys.argv[2]
vcf_input_fhand = gzip.open(vcf_input_fpath, 'r')
vcf_output_fhand = gzip.open(vcf_output_fpath, 'wt')

# Calculating new INFO fields for each variant
for line in vcf_input_fhand:
line = line.decode()

# Writing header to output
if line.startswith('##VEP'): # adding custom header before "##VEP" line
vcf_output_fhand.write(custom_header)
vcf_output_fhand.write(line)
continue
if line.startswith('#'):
vcf_output_fhand.write(line)
continue

# Dict to store the new calculated data
new_info = {}

# Getting variant and INFO data
variant_items = line.strip().split()
info_items = variant_items[7].split(';')

for info_item in info_items:

# Getting key/value for each INFO item
if len(info_item.split('=', maxsplit=1)) < 2: # skipping flags
continue
info_key, info_value = info_item.split('=', maxsplit=1)

# Getting INFO data for calculations
if info_key == 'pop_AF_hom':
pop_AF_hom = list(map(float, info_value.split('|')))
if info_key == 'pop_AF_het':
pop_AF_het = list(map(float, info_value.split('|')))
if info_key == 'AF_hom':
AF_hom = float(info_value)
if info_key == 'AF_het':
AF_het = float(info_value)
if info_key == 'pop_AC_hom':
pop_AC_hom = list(map(int, info_value.split('|')))
if info_key == 'pop_AC_het':
pop_AC_het = list(map(int, info_value.split('|')))
if info_key == 'AC_hom':
AC_hom = int(info_value)
if info_key == 'AC_het':
AC_het = int(info_value)
if info_key == 'pop_AN':
pop_AN = list(map(int, info_value.split('|')))
if info_key == 'AN':
AN = int(info_value)

# Calculating AF_{pop} and AF
# e.g. AF_sas = pop_AF_hom[i] + pop_AF_het[i] (i = index of sas population)
pop_AF = [x + y for x, y in zip(pop_AF_hom, pop_AF_het)]
for i, pop in enumerate(POPULATIONS):
new_info['AF_' + pop] = pop_AF[i]
new_info['AF'] = AF_hom + AF_het

# Calculating AC_{pop} and AC
# e.g. AC_sas = pop_AC_hom[i] + pop_AC_het[i] (i = index of sas population)
pop_AC = [x + y for x, y in zip(pop_AC_hom, pop_AC_het)]
for i, pop in enumerate(POPULATIONS):
new_info['AC_' + pop] = pop_AC[i]
new_info['AC'] = AC_hom + AC_het

# Calculating AN_{pop}
# e.g. AN_sas = pop_AN[i] (i = index of sas population)
for i, pop in enumerate(POPULATIONS):
new_info['AN_' + pop] = pop_AN[i]

# Calculating GTC_{pop}
# e.g. GTC_sas = (pop_AN[i] - (pop_AC_het[i] + pop_AC_hom[i])) + "," + pop_AC_het[i] + "," + pop_AC_hom[i]
pop_AC = [x + y for x, y in zip(pop_AC_hom, pop_AC_het)]
hom_ref = [x - y for x, y in zip(pop_AN, pop_AC)]
for i, pop in enumerate(POPULATIONS):
new_info['GTC_' + pop] = ','.join(map(str, [hom_ref[i], pop_AC_het[i], pop_AC_hom[i]]))
new_info['GTC'] = ','.join(map(str, [AN - (AC_hom + AC_het), AC_het, AC_hom]))

# Joining existing INFO field and new custom INFO data
custom_info_data = ';'.join(['='.join([k, str(new_info[k])]) for k in new_info])
new_info_field = ';'.join(info_items + [custom_info_data])

# Replacing original INFO field
variant_items[7] = new_info_field
vcf_output_fhand.write('\t'.join(variant_items) + '\n')


if __name__ == '__main__':
sys.exit(main())
44 changes: 44 additions & 0 deletions cellbase-app/app/scripts/gnomad/mitochondrial/opencga_gnomad_mt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash

# Variables
user="user"
host="host_name"
project="population"
project_name="Population"
study="gnomad_mt"
study_name="gnomAD v3.1 Mitocondrial DNA Variants"
study_path="data/"$study
folder_path="/home/gnomad_mt"
mapping_file="mapping_file_gnomad_mt_mod_file.txt"
vcf_file="gnomad.genomes.v3.1.sites.chrM.mod.vcf.gz"
mapping_file_path=$folder_path$mapping_file
vcf_file_path=$folder_path$vcf_file

# Login
/home/opencga-client-2.12.0/bin/opencga.sh login $user --host $host

# Project creation
/home/opencga-client-2.12.0/bin/opencga.sh projects create --id $project --name $project_name --organism-scientific-name hsapiens --organism-assembly grch38 --host $host

# Study creation
/home/opencga-client-2.12.0/bin/opencga.sh studies create --id $study --name $study_name --project $project --host $host

# Folders creation within Catalog
/home/opencga-client-2.12.0/bin/opencga.sh files create --path $study_path --parents --study $study --type DIRECTORY --host $host

# Uploading gnomad mt variants VCF and mapping file for gnomad mt variants
/home/opencga-client-2.12.0/bin/opencga.sh files upload -i $mapping_file_path --path $study_path --study $study --host $host

/home/opencga-client-2.12.0/bin/opencga.sh files upload -i $vcf_file_path --path $study_path --study $study --host $host

# Variant index for gnomad mt variants VCF
/home/opencga-client-2.12.0/bin/opencga.sh operations variant-index --study $study --file $vcf_file --load-archive NO --load-split-data CHROMOSOME --host $host

# Variant stats index for gnomad mt variants. The corresponding cohorts and variant cohort stats will be generated using the information of interest provided in the mapping file and INFO column of the gnomad mt VCF
/home/opencga-client-2.12.0/bin/opencga.sh operations variant-stats-index --study $study --aggregation-mapping-file $mapping_file --aggregated BASIC --host $host

# Variant cohort stats will be converted to population frequencies data model (julie-tool)
/home/opencga-client-2.12.0/bin/opencga.sh operations variant-julie-run --project $project --host $host

# Export of annotation.populationFrequencies in json format
/home/opencga-client-2.12.0/bin/opencga.sh variant export-run --body_include annotation.populationFrequencies --body_project $project --project $project --output-file-format json --host $host
Loading