-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
163 lines (127 loc) · 5.59 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import json
import os
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import AlignIO
from Bio.Align.Applications import ClustalOmegaCommandline
import dendropy
from dendropy.calculate import treecompare
# get df of each species sheet
def read_data(sheet_name, exel_file='protein_tables.xlsx'):
df = pd.read_excel(exel_file, sheet_name)
return df
# STEP 1 : get common bacteria set
def get_common_bacteria_set(protein_set, species):
print("\n COMMON BACTERIA SET\n")
if os.path.exists('out/common_bacteria_set.txt'):
with open('out/common_bacteria_set.txt', 'r') as f:
common_bacteria_set = f.read().strip().split("\n")
for i in common_bacteria_set:
print(i)
else:
common_bacteria_set = []
for _ in species:
specie_df = read_data(_)
proteins_set_in_specie = set(specie_df['Protein name'])
set_intersection = set(protein_set).intersection(proteins_set_in_specie)
if set_intersection == set(protein_set):
common_bacteria_set.append(_)
with open("out/common_bacteria_set.txt", "w") as f:
for i in common_bacteria_set:
f.write(i + "\n")
print(i)
print("\n")
return common_bacteria_set
# STEP 3 : extract gene sequence for each protein in protein_set and for each species in common_bacteria_set
def get_homologous_gene_sequences(protein_set, common_bacteria_set):
print(" HOMOLOGOUS GENE SEQUENCE")
if not os.path.exists('out/genomes.txt'):
genomes = {}
for p in protein_set:
new_p = p.replace(" ", "_")
genomes_of_p = []
species = []
print(" ", p, " \n")
for c_b in list(common_bacteria_set):
species.append(c_b)
df = read_data(c_b)
df_p = df.loc[df['Protein name'] == p]
first_index = df_p.first_valid_index()
start, stop = df_p.loc[first_index]['Start'], df_p.loc[first_index]['Stop']
for seq_record in SeqIO.parse(open('fasta/{}.fasta'.format(c_b)), "fasta"):
c_b_seq = seq_record.seq[start:stop + 1]
genomes_of_p.append(SeqRecord(c_b_seq, c_b, new_p, ""))
genomes['p'] = genomes_of_p
SeqIO.write(genomes_of_p, "out/homologous_gene_sequences/{}.fasta".format(new_p), "fasta")
with open('out/genomes.txt', 'a') as genomes_file:
genomes_file.write(p + " - " + str(genomes_of_p))
# STEP 4 : build phylogenetic trees
def build_phylogeny_trees():
path = "out/homologous_gene_sequences/"
out_path = "out/aligned_homologous_gene_sequences/"
for homologous_gene_sequence in os.listdir(path):
in_file = path + homologous_gene_sequence
out_file = out_path + homologous_gene_sequence
if not os.path.exists(out_file):
clustalomega_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True)
os.system(str(clustalomega_cline))
msa = AlignIO.read(out_file, 'fasta')
# Calculate the distance matrix
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(msa)
# Print the distance Matrix
print('\nDistance Matrix\n===================')
print(dm)
# Construct the phylogenetic tree using UPGMA algorithm
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
# Draw the phylogenetic tree
Phylo.draw(tree)
# Print the phylogenetic tree in the terminal
print('\nPhylogenetic Tree\n', homologous_gene_sequence)
Phylo.draw_ascii(tree)
Phylo.write([tree], 'out/trees/{}_tree.nex'.format(homologous_gene_sequence), 'nexus')
def calculate_tree_distance():
tree_distances = {}
print(" TREE DISTANCES\n")
for i in os.listdir('out/trees'):
tree_distances[i] = {}
print(i, "\n")
for j in os.listdir('out/trees'):
if i >= j:
continue
else:
tns = dendropy.TaxonNamespace()
tree1 = dendropy.Tree.get_from_path(
'out/trees/{}'.format(i),
'nexus',
taxon_namespace=tns,)
tree2 = dendropy.Tree.get_from_path(
'out/trees/{}'.format(j),
"nexus",
taxon_namespace=tns)
tree1.encode_bipartitions()
tree2.encode_bipartitions()
tree_distances[i][j] = round(treecompare.weighted_robinson_foulds_distance(tree1, tree2),3)
print("\n")
with open('out/tree_distances.json', 'w') as f:
json.dump(tree_distances, f)
if __name__ == '__main__':
protein_set = ["site-specific DNA-methyltransferase",
"LysR family transcriptional regulator",
"helix-turn-helix domain-containing protein",
"efflux transporter outer membrane subunit"]
excel_file = pd.ExcelFile('protein_tables.xlsx')
species = excel_file.sheet_names
# STEP 1
common_bacteria_set = get_common_bacteria_set(protein_set, species)
# STEP 3
get_homologous_gene_sequences(protein_set, common_bacteria_set)
# STEP 4
build_phylogeny_trees()
# STEP 5
calculate_tree_distance()