-
Notifications
You must be signed in to change notification settings - Fork 0
/
tcga_distance_to_tss.py
67 lines (59 loc) · 2.51 KB
/
tcga_distance_to_tss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import glob
import os
import pandas as pd
import argparse
def get_args():
    '''
    Build the command-line parser and return the parsed arguments.

    The single option, -g/--gene_block, is mandatory. Its value is left as
    the raw string given on the command line; no type conversion is done here.
    '''
    cli = argparse.ArgumentParser(description="get closest distance to tss")
    # List the mandatory option under its own heading in the --help output.
    mandatory = cli.add_argument_group("Required input parameters")
    mandatory.add_argument(
        '-g', '--gene_block',
        required=True,
        help='Gene block.',
    )
    namespace = cli.parse_args()
    return namespace
def Distance_to_tss(bedpe, chr, tss):
    """Return the smallest absolute distance from a copy-number breakpoint to a TSS.

    The file at ``bedpe`` is read as a tab-separated table that must provide
    the columns ``chromosome``, ``start``, ``end`` and ``total_cn``. A
    breakpoint is recorded at a segment's ``end`` when the following row
    starts at ``end + 1`` and carries a different ``total_cn``.

    Parameters:
        bedpe: path of the tab-separated segment file.
        chr: chromosome to restrict the search to; compared with the file's
            ``chromosome`` column after both are converted to ``str``.
        tss: transcription start site coordinate (anything ``int()`` accepts).

    Returns:
        The minimum of ``abs(breakpoint - tss)`` over the breakpoints found on
        ``chr``, or NaN when no breakpoint exists on that chromosome.
    """
    # NOTE(review): the previous body computed the breakpoints and then read an
    # undefined name (`aliquot_id_data`), so every call raised NameError.
    # Distances are now measured from the detected breakpoints -- confirm this
    # is the intended quantity.
    segments = pd.read_csv(bedpe, sep='\t')

    # Compare each row with its successor. shift(-1) yields NaN for the last
    # row, which can never satisfy the adjacency test, so the final segment is
    # excluded without special-casing (and a file with no data rows is handled).
    following_start = segments['start'].shift(-1)
    following_cn = segments['total_cn'].shift(-1)
    adjacent = (segments['end'] + 1) == following_start
    cn_changes = segments['total_cn'] != following_cn
    breakpoints = segments.loc[adjacent & cn_changes, ['chromosome', 'end']]

    # NOTE(review): string comparison means '1' and 'chr1' do not match --
    # verify the gene table and the segment files use the same naming.
    on_chromosome = breakpoints[breakpoints['chromosome'].astype(str) == str(chr)]

    distances = on_chromosome['end'] - int(tss)
    # min() of an empty Series is NaN, which signals "no breakpoint here".
    return distances.abs().min()
def chunks(l, n):
    """Generate consecutive slices of l, each holding at most n items."""
    offsets = range(0, len(l), n)
    yield from (l[lo:lo + n] for lo in offsets)
if __name__ == '__main__':
    # Script entry point: for the gene block selected with -g, compute the
    # per-file minimum breakpoint-to-TSS distance of every gene in the block
    # and write one CSV with a row per CNV file and a column per gene.
    args = get_args()
    gb = int(args.gene_block)
    print('preparing')
    pcawg_input = glob.glob('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/CNV/*.txt')
    map1 = pd.read_csv('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/gencode_hg38_56354gene.csv')

    # The gene table is cut into consecutive blocks of 54 rows; `gb` picks the
    # block this run is responsible for.
    total_gene_list = list(chunks(map1.gene_id2.tolist(), 54))
    total_chrom_list = list(chunks(map1.chr.tolist(), 54))
    total_tss_list = list(chunks(map1.TSS.tolist(), 54))
    gene_list = total_gene_list[gb]
    chrom_list = total_chrom_list[gb]
    tss_list = total_tss_list[gb]

    table_list = []
    for gene, chrom, tss in zip(gene_list, chrom_list, tss_list):
        print(gene)
        # Collect plain dict rows and build the frame once: DataFrame.append
        # was removed in pandas 2.0 and copied the whole table on every call.
        records = []
        for bedpe in pcawg_input:
            aliquot_id = os.path.basename(bedpe)
            print('processing {}'.format(aliquot_id.split('.')[0]))
            distance = Distance_to_tss(bedpe, chrom, tss)
            records.append({'CNV_file_name': aliquot_id, gene: distance})
        # Passing `columns` keeps the two-column layout even when no input
        # files were found.
        table_list.append(pd.DataFrame(records, columns=['CNV_file_name', gene]))

    # Join the per-gene tables side by side on the file name.
    df = table_list[0]
    for df_ in table_list[1:]:
        df = df.merge(df_, on='CNV_file_name')
    df.to_csv('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/TCGA_Table_distance_to_tss.{}.csv'.format(gb), index=False)