-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Filter out indels that are actually STRs, as well as duplicate STRs. (#…
…227) * first attempt * click.command * reduce logging statements * Update remove_STR_indels.py * reduce memory usage * Update remove_STR_indels.py * trying to stop the warning check * update docu * black * write to analysis * move the folder into str directory * whole copies only
- Loading branch information
1 parent
b4824f4
commit 724d283
Showing
1 changed file
with
193 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This script filters raw associaTR outputs that contain both eSTR and eSNP results (i.e. we assume that `dataframe_concatenator.py` has been run). | ||
Particularly, we remove indels that actually represent STRs, AND remove duplicate eSTRs (retain only one eSTR per duplicate set). | ||
This is necessary to improve the accuracy of fine-mapping. | ||
Indels are considered STRs (and subsequently removed) if: | ||
- they overlap with an STR region, specified by an eSTR in the associaTR output; and | ||
- the (reverse complement) sequence of the indel is a whole copy of at least one cyclical representation of the STR motif. | ||
e.g. a 'TAT' insertion overlapping with a 'ATT' STR would be considered an STR as 'TAT' is a cyclical representation of 'ATT'. | ||
a 'GC' insertion overlapping with a 'CAG' STR would not be considered an STR, as 'GC' is a partial copy of 'GCA', a cyclical representation of 'CAG'. | ||
Note that impure indels are conservatively not considered STRs. For example, a 'GCCGCA' insertion overlapping a 'GCC' STR would not be considered an STR. | ||
This script additionally removes duplicate eSTRs (defined by sharing the same coordinates and motif), retaining only one eSTR per duplicate set (chosen based on having the lowest p-value). | ||
Usage: | ||
analysis-runner --dataset bioheart --output-dir str/associatr/snps_and_strs/rm_str_indels_dup_strs/tob_n1055_and_bioheart_n990/meta_results \ | ||
--description "Remove STR indels and duplicate eSTRs" \ | ||
--access-level "test" \ | ||
remove_STR_indels.py \ | ||
--associatr-dir=gs://cpg-bioheart-test/str/associatr/snps_and_strs/tob_n1055_and_bioheart_n990/meta_results/meta_results \ | ||
--celltypes=gdT,B_intermediate,ILC,Plasmablast,dnT,ASDC,cDC1,pDC,NK_CD56bright,MAIT,B_memory,CD4_CTL,CD4_Proliferating,CD8_Proliferating,HSPC,NK_Proliferating,cDC2,CD16_Mono,Treg,CD14_Mono,CD8_TCM,CD4_TEM,CD8_Naive,CD4_TCM,NK,CD8_TEM,CD4_Naive,B_naive \ | ||
--chromosomes=chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22 | ||
""" | ||
import ast | ||
|
||
import click | ||
import numpy as np | ||
|
||
import hailtop.batch as hb | ||
from hailtop.batch import ResourceGroup | ||
|
||
from cpg_utils.config import output_path | ||
from cpg_utils.hail_batch import get_batch | ||
|
||
|
||
def check_str(motif): | ||
""" | ||
Checks if a row in a df represents an STR (i.e. the motif does not contain a '-') | ||
""" | ||
return '-' not in motif | ||
|
||
|
||
def reverse_complement(sequence): | ||
""" | ||
Return the reverse complement of a DNA sequence. | ||
""" | ||
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} | ||
reverse_sequence = sequence[::-1] | ||
reverse_complement_sequence = ''.join(complement_dict.get(base, base) for base in reverse_sequence) | ||
return reverse_complement_sequence | ||
|
||
|
||
def cyclical_shifts(s): | ||
""" | ||
Generate all cyclical shifts of a string. | ||
""" | ||
return [s[i:] + s[:i] for i in range(len(s))] | ||
|
||
|
||
def filter_str_indels_and_duplicates(associatr_dir, celltype, chrom): | ||
import pandas as pd | ||
|
||
from cpg_utils import to_path | ||
|
||
# Load associaTR output | ||
gene_files = list(to_path(f'{associatr_dir}/{celltype}/{chrom}').glob('*.tsv')) | ||
for gene_file in gene_files: | ||
data = pd.read_csv(str(gene_file), sep='\t') | ||
gene_file_name = str(gene_file).split('/')[-1] | ||
print(f'Processing {gene_file_name}...') | ||
|
||
# Find duplicate eSTRs and remove all but the one with the lowest p-value | ||
duplicates = data[data.duplicated(subset=['chr', 'pos', 'motif'], keep=False)] | ||
# Group by 'chr', 'pos', 'motif' and keep the one with the lowest 'p-val' | ||
lowest_pval_duplicates = duplicates.loc[duplicates.groupby(['chr', 'pos', 'motif'])['pval_meta'].idxmin()] | ||
# Find non-duplicate rows | ||
non_duplicates = data.drop(duplicates.index) | ||
# Concatenate the non-duplicates with the lowest p-value duplicates | ||
result_df = pd.concat([non_duplicates, lowest_pval_duplicates]).sort_index() | ||
|
||
# get df containing only STRs | ||
result_df_str = result_df[result_df['motif'].apply(check_str)] | ||
result_df_str.loc[:, 'end'] = ( | ||
result_df_str.loc[:, 'pos'] + result_df_str.loc[:, 'ref_len'] * result_df_str.loc[:, 'period'] | ||
) | ||
|
||
indices_to_delete = [] # store indices of indels representing STRs (to drop later) | ||
for i, row1 in result_df.iterrows(): # iterate through every variant record in df | ||
if '-' not in row1['motif']: # skip if STR | ||
continue | ||
parts = row1['motif'].split('-') | ||
if len(parts[0]) == 1 and len(parts[1]) == 1: # skip if SNP (single base change) | ||
continue | ||
|
||
pos_indel = row1['pos'] | ||
# Split the motif to find the indel | ||
motif_parts = row1['motif'].split('-') | ||
|
||
indel = next((part for part in motif_parts if len(part) != 1), None)[1:] | ||
|
||
found = False | ||
for j, row2 in result_df_str.iterrows(): | ||
if ( | ||
row2['pos'] <= pos_indel <= row2['end'] | ||
): # Check if pos_indel is between 'pos' and 'end' of any row in STR dataframe | ||
str_motif = row2['motif'] | ||
|
||
if (indel == str_motif) or ( | ||
reverse_complement(indel) == str_motif | ||
): # straightforward case where (reverse complement of) indel matches STR motif exactly | ||
# print(f"Indel: {row1['motif']} {indel} matches motif {str_motif}") | ||
indices_to_delete.append(i) | ||
found = True | ||
break | ||
|
||
if len(indel) % len(str_motif) == 0: # check if len(indel) is a multiple of len(str_motif) | ||
elongated_motif = str_motif * (len(indel) // len(str_motif)) | ||
if (indel in cyclical_shifts(elongated_motif)) or ( | ||
reverse_complement(indel) in cyclical_shifts(elongated_motif) | ||
): | ||
indices_to_delete.append(i) | ||
found = True | ||
break | ||
|
||
found = True | ||
break | ||
# if not found: | ||
# print("Indel not in STR interval") | ||
# Drop the rows where indels are representing STRs | ||
result_df = result_df.drop(indices_to_delete) | ||
result_df.to_csv(output_path(f'{celltype}/{chrom}/{gene_file_name}', 'analysis'), sep='\t', index=False) | ||
|
||
|
||
@click.option('--associatr-dir', required=True, type=str, help='Directory containing associaTR outputs.') | ||
@click.option('--celltypes', required=True, type=str, help='Comma-separated list of cell types to process.') | ||
@click.option('--chromosomes', required=True, type=str, help='Comma-separated list of chromosomes to process.') | ||
@click.option( | ||
'--max-parallel-jobs', | ||
required=True, | ||
type=int, | ||
help='Maximum number of jobs to run in parallel.', | ||
default=50, | ||
) | ||
@click.option('--job-cpu', required=False, type=float, help='Number of CPUs to use per job.', default=0.25) | ||
@click.option('--job-storage', required=False, type=str, help='Storage to use per job.', default='0G') | ||
@click.command() | ||
def main( | ||
associatr_dir: str, | ||
celltypes: str, | ||
chromosomes: str, | ||
max_parallel_jobs: int, | ||
job_cpu: int, | ||
job_storage: str, | ||
): | ||
# Setup MAX concurrency by genes | ||
_dependent_jobs: list[hb.batch.job.Job] = [] | ||
|
||
def manage_concurrency_for_job(job: hb.batch.job.Job): | ||
""" | ||
To avoid having too many jobs running at once, we have to limit concurrency. | ||
""" | ||
if len(_dependent_jobs) >= max_parallel_jobs: | ||
job.depends_on(_dependent_jobs[-max_parallel_jobs]) | ||
_dependent_jobs.append(job) | ||
|
||
b = get_batch(name='Remove STR indels and duplicate eSTRs') | ||
for celltype in celltypes.split(','): | ||
for chrom in chromosomes.split(','): | ||
filter_job = b.new_python_job( | ||
f'Remove STR-indels and duplicate eSTRs {celltype}:{chrom}', | ||
) | ||
filter_job.cpu(job_cpu) | ||
filter_job.storage(job_storage) | ||
|
||
filter_job.call(filter_str_indels_and_duplicates, associatr_dir, celltype, chrom) | ||
manage_concurrency_for_job(filter_job) | ||
|
||
b.run(wait=False) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |