Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare input files for FINEMAP #232

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions str/fine-mapping/finemap_files_prep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

"""

This script will prepare the z and ld files for FINEMAP, based on outputs from associaTR (meta-analysis) and corr_matrix_maker.py.

The z file contains the effect size and standard error estimates for each variant associated with a gene.
The ld file contains the correlation matrix calcualtions for each variant associated with a gene.

analysis-runner --dataset "bioheart" \
--description "Prepare files for FINEMAP" \
--access-level "test" \
--output-dir "str/associatr/fine_mapping" \
finemap_files_prep.py \
--ld-dir "gs://cpg-bioheart-test-analysis/str/associatr/fine_mapping/prep_files/v2-whole-copies-only/correlation_matrix" \
--associatr-dir "gs://cpg-bioheart-test-analysis/str/associatr/snps_and_strs/rm_str_indels_dup_strs/v2-whole-copies-only/tob_n1055_and_bioheart_n990/meta_results" \
--celltypes "ASDC" \
--chroms "chr22"



"""

import click
import numpy as np
import pandas as pd

import hailtop.batch as hb

from cpg_utils import to_path
from cpg_utils.config import output_path
from cpg_utils.hail_batch import get_batch


def z_file_maker(gene_name, ld_file, associatr_dir, celltype, chrom):
"""
Prepares the 'z file' for FINEMAP based on the output from associaTR (meta-analysis).

"""

# Load the TSV file
df = pd.read_csv(f'{associatr_dir}/{celltype}/{chrom}/{gene_name}_100000bp_meta_results.tsv', sep='\t')

# Create the 'rsid' column
df['rsid'] = df['chr'].astype(str) + ':' + df['pos'].astype(str) + '_' + df['motif']

# Rename the 'chr' and 'pos' columns
df = df.rename(columns={'chr': 'chromosome', 'pos': 'position', 'coeff_meta': 'beta', 'se_meta': 'se'})

# Create the 'allele1', 'allele2', 'maf' columns with NaN values
df['allele1'] = 'nan'
df['allele2'] = 'nan'
df['maf'] = 'nan'

# Select the required columns
df = df[['rsid', 'chromosome', 'position', 'allele1', 'allele2', 'maf', 'beta', 'se']]

# Load the index file
df_index = pd.read_csv(ld_file, sep='\t')

# Set the index of df to be 'rsid'
df = df.set_index('rsid')

# Reindex df according to the order of 'rsid' in df_index
df = df.reindex(df_index['Unnamed: 0'])

# Reset the index of df
df = df.reset_index()
df = df.rename(columns={'Unnamed: 0': 'rsid'})

output_file_path = output_path(f'finemap_prep/{celltype}/{chrom}/{gene_name}.z', 'analysis')
# Save the DataFrame to a space-delimited file
df.to_csv(output_file_path, sep=' ', index=False)


def ld_file_maker(gene_name, ld_file, celltype, chrom):
"""
Prepares the 'ld file' for FINEMAP based on the output from corr_matrix_maker.py.
"""

# Load the TSV file
df = pd.read_csv(ld_file, sep='\t')
df = df.drop(columns=['Unnamed: 0'])

# Convert the data type of the DataFrame to np.float64
df = df.astype(np.float64)

# Rounding to prevent float representation errors by FINEMAP
df = df.round(4)

# Remove column names
df.columns = ['' for _ in df.columns]

# Remove index name
df.index.names = [None]

output_file_path = output_path(f'finemap_prep/{celltype}/{chrom}/{gene_name}.ld', 'analysis')
# Save the DataFrame to a space-delimited file without index
df.to_csv(output_file_path, sep=' ', index=False, header=False)


@click.option('--ld-dir', required=True, help='Path to the directory containing the LD files')
@click.option('--associatr-dir', required=True, help='Path to the directory containing the associatr files')
@click.option('--celltypes', required=True, help='Cell type')
@click.option('--chroms', required=True, help='Chromosome')
@click.option('--ld-job-cpu', help='CPU for LD job', default=0.25)
@click.option('--z-job-cpu', help='CPU for Z job', default=0.25)
@click.option('--max-parallel-jobs', help='Maximum number of parallel jobs to run', default=500)
@click.command()
def main(ld_dir, associatr_dir, celltypes, chroms, ld_job_cpu, z_job_cpu, max_parallel_jobs):
# Setup MAX concurrency by genes
_dependent_jobs: list[hb.batch.job.Job] = []

def manage_concurrency_for_job(job: hb.batch.job.Job):
"""
To avoid having too many jobs running at once, we have to limit concurrency.
"""
if len(_dependent_jobs) >= max_parallel_jobs:
job.depends_on(_dependent_jobs[-max_parallel_jobs])
_dependent_jobs.append(job)

b = get_batch(name=f'FINEMAP prep for {celltypes}: {chroms}')
for celltype in celltypes.split(','):
for chrom in chroms.split(','):
ld_files = list(to_path(f'{ld_dir}/{celltype}/{chrom}').glob('*.tsv'))
for ld_file in ld_files:
gene_name = str(ld_file).split('/')[-1].split('_')[0] # ENSG ID

if (
to_path(
output_path(f'finemap_prep/{celltype}/{chrom}/{gene_name}.ld', 'analysis'),
).exists()
and to_path(output_path(f'finemap_prep/{celltype}/{chrom}/{gene_name}.z', 'analysis')).exists()
):
continue
Comment on lines +129 to +135
Copy link
Contributor

@MattWellie MattWellie Jun 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not a biggie, but be aware of your job scaling (type * chrom * gene), and you're creating 2 existence checks per combination. All that happens in the driver job so you're delaying the start of the actual work.

I'd experiment with a change here -

# get all files in the output folder, recursively, in a single query
all_files = list(to_path(output_path('finemap_prep', 'analysis')).glob('**'))

# check whether your intended outputs are in that list
ld_file = output_path(f'finemap_prep/{celltype}/{chrom}/{gene_name}.ld', 'analysis')
z_file = output_path(f'finemap_prep/{celltype}/{chrom}/{gene_name}.z', 'analysis')
if all (filepath in all_files for filepath in [z_file, ld_file]):
    continue

I thiiiiink this should scale a lot better, by posting one large query instead of thousands of individual ones.

This also builds the full output file names, so you can pass them to the relevant methods (you pass the celltype, chrom, and gene name to your methods, but you already made the full path here to check if it exists)

ld_job = b.new_python_job(
f'LD file maker for {celltype}:{chrom} {gene_name}',
)
ld_job.cpu(ld_job_cpu)
ld_job.call(ld_file_maker, gene_name, ld_file, celltype, chrom)
manage_concurrency_for_job(ld_job)
z_job = b.new_python_job(f'Z file maker for {celltype}:{chrom} {gene_name}')
z_job.cpu(z_job_cpu)
z_job.call(z_file_maker, gene_name, ld_file, associatr_dir, celltype, chrom)
manage_concurrency_for_job(z_job)

b.run(wait=False)


if __name__ == '__main__':
main()
Loading