Skip to content

Commit

Permalink
Condition on the SNP genotype for eQTL analysis (#222)
Browse files Browse the repository at this point in the history
* add option to condition on certain SNP loci

* lint

* snp_input_dir added as a config param

* lint

* test

* remove to_csv because i know it works now

* use gt_types

* remove unneccessary code

* change coding of HOM ALT to 2
  • Loading branch information
hopedisastro authored May 23, 2024
1 parent 6c1c558 commit d44310f
Showing 1 changed file with 45 additions and 1 deletion.
46 changes: 45 additions & 1 deletion str/associatr/get_cis_numpy_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
- output gene-level phenotype and covariate numpy objects for input into associatr
analysis-runner --config get_cis_numpy_files.toml --dataset "bioheart" --access-level "test" \
--description "get cis and numpy" --output-dir "str/associatr/tob_n1055/input_files" \
--description "get cis and numpy" --output-dir "str/associatr/tob_n1055/tester" \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/scanpy:1.9.3 \
python3 get_cis_numpy_files.py
Expand All @@ -21,6 +21,7 @@
import numpy as np
import pandas as pd
import scanpy as sc
from cyvcf2 import VCF
from scipy.stats import norm

import hail as hl
Expand All @@ -31,6 +32,33 @@
from cpg_utils.hail_batch import get_batch, image_path, init_batch, output_path


def extract_genotypes(vcf_file, loci):
"""
Helper function to extract genotypes (SNPs) from a VCF file; target loci specified as a list (can be single or multiple)
"""
# Read the VCF file
vcf_reader = VCF(vcf_file)

results = pd.DataFrame()
results['sample_id'] = vcf_reader.samples

# Iterate through the records in the VCF file for each locus
for locus in loci:
chrom, pos = locus.split(':')
pos = int(pos)

for record in vcf_reader(f'{chrom}:{pos}-{pos}'):
if record.CHROM == chrom and record.POS == pos:
gt = record.gt_types
gt[gt == 3] = 2 #HOM ALT is coded as 3; change it to 2
results[locus] = gt
break

# Convert results to a DataFrame
return results


def cis_window_numpy_extractor(
input_h5ad_dir,
input_pseudobulk_dir,
Expand All @@ -42,6 +70,8 @@ def cis_window_numpy_extractor(
chrom_len,
min_pct,
remove_samples_file,
snp_input,
snp_loci,
):
"""
Creates gene-specific cis window files and phenotype-covariate numpy objects
Expand Down Expand Up @@ -106,6 +136,11 @@ def cis_window_numpy_extractor(

gene_pheno_cov = gene_pheno.merge(covariates, on='sample_id', how='inner')

# add SNP genotypes we would like to condition on
if snp_input is not None:
snp_genotype_df = extract_genotypes(snp_input['vcf'], snp_loci)
gene_pheno_cov = gene_pheno_cov.merge(snp_genotype_df, on='sample_id', how='inner')

# filter for samples that were assigned a CPG ID; unassigned samples after demultiplexing will not have a CPG ID
gene_pheno_cov = gene_pheno_cov[gene_pheno_cov['sample_id'].str.startswith('CPG')]

Expand Down Expand Up @@ -151,6 +186,13 @@ def manage_concurrency_for_job(job: hb.batch.job.Job):
j.cpu(get_config()['get_cis_numpy']['job_cpu'])
j.memory(get_config()['get_cis_numpy']['job_memory'])
j.storage(get_config()['get_cis_numpy']['job_storage'])

if get_config()['get_cis_numpy']['snp_vcf_dir'] is not None:
snp_vcf_dir = get_config()['get_cis_numpy']['snp_vcf_dir']
snp_vcf_path = f'{snp_vcf_dir}/{chrom}_common_variants.vcf.bgz'
snp_input = get_batch().read_input_group(**{'vcf': snp_vcf_path, 'csi': snp_vcf_path + '.csi'})
else: # no SNP VCF provided
snp_input = None
j.call(
cis_window_numpy_extractor,
get_config()['get_cis_numpy']['input_h5ad_dir'],
Expand All @@ -163,6 +205,8 @@ def manage_concurrency_for_job(job: hb.batch.job.Job):
chrom_len,
get_config()['get_cis_numpy']['min_pct'],
get_config()['get_cis_numpy']['remove_samples_file'],
snp_input,
get_config()['get_cis_numpy']['snp_loci'],
)

manage_concurrency_for_job(j)
Expand Down

0 comments on commit d44310f

Please sign in to comment.