-
Notifications
You must be signed in to change notification settings - Fork 1
/
produce_entropy_plots.py
258 lines (216 loc) · 10.3 KB
/
produce_entropy_plots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# code to generate figures 6 and 7 in the paper
import os, io
import re
import random
import urllib.request
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from umap import UMAP
import scipy
from scipy.cluster.hierarchy import ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform, pdist
from scipy_cut_tree_balanced import cut_tree_balanced
import sklearn.metrics as metrics
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.cluster import adjusted_rand_score
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
def check_symmetric(arr):
    """Return the upper-triangle positions at which a square matrix is asymmetric.

    Parameters
    ----------
    arr : array-like
        Two-dimensional square matrix.

    Returns
    -------
    list of tuple of int
        ``(i, j)`` pairs with ``i < j`` for which ``arr[i, j] != arr[j, i]``,
        in row-major order. The list is empty when the matrix is symmetric.

    Raises
    ------
    ValueError
        If ``arr`` is not a square two-dimensional array.
    """
    arr = np.asarray(arr)
    if arr.ndim != 2 or arr.shape[0] != arr.shape[1]:
        raise ValueError("The given array is not square!")
    # Compare the matrix with its transpose in one vectorised pass and keep
    # only the strict upper triangle so each mirrored pair is reported once.
    mismatch = np.triu(arr != arr.T, k=1)
    return [(int(i), int(j)) for i, j in np.argwhere(mismatch)]
def make_symmetric(mat):
    """Symmetrise a square matrix in place, keeping the smaller of each mirrored pair.

    For every ``i != j`` both ``mat[i, j]`` and ``mat[j, i]`` are set to the
    minimum of the two; the diagonal is left unchanged.

    Parameters
    ----------
    mat : numpy.ndarray
        Two-dimensional square array. It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same ``mat`` object, now symmetric.

    Raises
    ------
    ValueError
        If ``mat`` is not two-dimensional or not square.
    """
    rows, cols = mat.shape
    if rows != cols:
        raise ValueError("The given array is not square!")
    # np.minimum materialises a new array before the in-place assignment, so
    # reading mat and mat.T while writing to mat is safe.
    # NOTE: np.minimum propagates NaN when either mirrored entry is NaN.
    mat[...] = np.minimum(mat, mat.T)
    return mat
def cluster_purity(cluster_id, clusters, assigned_labels):
    """Return the share of a cluster's members that carry its most common label.

    Parameters
    ----------
    cluster_id
        Identifier of the cluster to evaluate.
    clusters : array-like
        Cluster identifier of every item (a list or a NumPy array).
    assigned_labels : array-like
        Label of every item, aligned with ``clusters``.

    Returns
    -------
    float
        Count of the most frequent label inside the cluster divided by the
        cluster size.

    Raises
    ------
    ValueError
        If no item belongs to ``cluster_id``.
    """
    assigned_labels = np.asarray(assigned_labels)
    # Coerce to an array so that a plain list is compared element-wise rather
    # than collapsing to a single scalar ``False``.
    member_idx = np.where(np.asarray(clusters) == cluster_id)[0]
    size = member_idx.shape[0]
    if size == 0:
        raise ValueError(f"Cluster {cluster_id!r} has no members.")
    _, counts = np.unique(assigned_labels[member_idx], return_counts=True)
    return counts.max() / size
def weighted_average_purity(assigned_labels, clusters):
    """Average the purity of all clusters, weighting each by its member count.

    Parameters
    ----------
    assigned_labels : sequence
        Label of every item.
    clusters : numpy.ndarray
        Cluster identifier of every item, aligned with ``assigned_labels``.

    Returns
    -------
    float
        Sum over clusters of ``purity * cluster size`` divided by
        ``len(assigned_labels)``.
    """
    n_items = len(assigned_labels)
    weighted_total = 0
    for cid in np.unique(clusters):
        purity = cluster_purity(cid, clusters, assigned_labels)
        member_count = np.where(clusters == cid)[0].shape[0]
        weighted_total += purity * member_count
    return weighted_total / n_items
def cluster_entropy(cluster_id, clusters, assigned_labels):
    """Return the normalised Shannon entropy of the labels inside one cluster.

    The entropy (base 2) of the label distribution among the cluster's members
    is divided by ``log2(n)``, where ``n`` is the number of distinct labels in
    the whole of ``assigned_labels``.

    Parameters
    ----------
    cluster_id
        Identifier of the cluster to evaluate.
    clusters : array-like
        Cluster identifier of every item (a list or a NumPy array).
    assigned_labels : array-like
        Label of every item, aligned with ``clusters``.

    Returns
    -------
    float
        Normalised entropy. ``0.0`` is returned when fewer than two distinct
        labels exist overall (the normaliser ``log2(n)`` would be zero or
        undefined) or when the cluster has no members.
    """
    assigned_labels = np.asarray(assigned_labels)
    n_labels = np.unique(assigned_labels).size
    member_idx = np.where(np.asarray(clusters) == cluster_id)[0]
    # With a single possible label there is no uncertainty, and dividing by
    # log2(1) == 0 would yield NaN; an empty cluster likewise contributes 0.
    if n_labels < 2 or member_idx.size == 0:
        return 0.0
    # Only labels present in the cluster are counted; absent labels would
    # contribute a zero term to the entropy anyway.
    _, counts = np.unique(assigned_labels[member_idx], return_counts=True)
    proportions = counts / member_idx.size
    entropy = -np.sum(proportions * np.log2(proportions))
    return float(entropy / np.log2(n_labels))
def weighted_average_entropy(assigned_labels, clusters):
    """Average the normalised entropy of all clusters, weighted by cluster size.

    Parameters
    ----------
    assigned_labels : sequence
        Label of every item.
    clusters : numpy.ndarray
        Cluster identifier of every item, aligned with ``assigned_labels``.

    Returns
    -------
    float
        Sum over clusters of ``entropy * cluster size`` divided by
        ``len(assigned_labels)``.
    """
    n_items = len(assigned_labels)
    weighted_total = 0
    for cid in np.unique(clusters):
        entropy = cluster_entropy(cid, clusters, assigned_labels)
        member_count = np.where(clusters == cid)[0].shape[0]
        weighted_total += entropy * member_count
    return weighted_total / n_items
# For each Target, retrieve its Activity and Factor annotation.
marks = pd.read_excel("./data/target_activity_factor.xlsx")
marks.columns = ["Target", "Activity", "Factor"]

# Retrieve the hg38-aligned dataset metadata, keeping only the columns used
# further down. (The former bare ``value_counts()`` expressions were discarded
# notebook residue with no effect in a script and have been removed.)
df38 = pd.read_csv("./data/genome_df38.csv", delimiter=",")
df38 = df38.loc[:, ['Accession', 'Target', 'Biosample term name', 'Genome']]

# Attach Activity/Factor to every dataset row. pd.merge defaults to an inner
# join, so rows whose Target has no entry in ``marks`` are dropped.
merged_df = pd.merge(df38, marks, on='Target')
# Build the hierarchical-clustering linkage for all 23 chromosomes
# (1-22, with chromosome X stored as the 23rd entry).
list_linkages = []
for chrom in [*range(1, 23), 'X']:
    corr_path = f"./results38/hg38_chr{chrom}_200datacorrelation.h5"
    cor_dist = pd.read_csv(corr_path, index_col=0).to_numpy()
    np.fill_diagonal(cor_dist, 0)
    # Symmetrise only when at least one mirrored pair differs, before the
    # matrix is converted to condensed form.
    if check_symmetric(cor_dist):
        cor_dist = make_symmetric(cor_dist)
    linkresult = sch.linkage(squareform(cor_dist), method="complete")
    # Clamp any negative entries of the linkage matrix to zero.
    linkresult[linkresult < 0] = 0
    list_linkages.append(linkresult)
# Producing FIGURE 6 in the paper: number of flat clusters obtained for each
# chromosome as a function of the distance threshold used to cut the tree.
colors = list(cm.tab20(np.linspace(0, 1, 20)))
additional_colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
colors += additional_colors  # tab20 provides 20 colours; 23 curves are drawn
plt.figure(figsize=(10, 6))
fl = 14
# The thresholds are the same for every chromosome, so build them once.
distances = np.arange(2, 0, -0.1)
for i in range(0, 23):
    chr_id = i + 1
    linkresult = list_linkages[i]
    num_clusters = [
        len(np.unique(fcluster(linkresult, dist, criterion='distance')))
        for dist in distances
    ]
    # The 23rd linkage belongs to chromosome X.
    curve_label = f"Chr {chr_id}" if chr_id != 23 else "Chr X"
    plt.plot(distances, np.log10(num_clusters), '-o', label=curve_label, color=colors[i])
plt.title('Number of Clusters vs. Distance', fontsize=20)
plt.xlabel('Distance', fontsize=fl)
plt.ylabel('Logarithm of the Number of Clusters', fontsize=fl)
plt.xticks(fontsize=fl)
plt.yticks(fontsize=fl)
plt.legend(loc='lower left', bbox_to_anchor=(0.0, 0.0), ncol=4, fontsize=9)
plt.grid(True)
plt.savefig("./results38/entropy_clusters_paper_plot_hg38.eps", format='eps')
#plt.show()
# Producing FIGURE 7 in the paper: shared inputs for the entropy plots.
random.seed(2023)  # fixed seed for the label shuffling done with `random`
labels_mark, labels_cell, labels_factor, labels_activity = (
    list(merged_df[column])
    for column in ('Target', 'Biosample term name', 'Factor', 'Activity')
)
additional_colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
colors = [*cm.tab20(np.linspace(0, 1, 20)), *additional_colors]
distances = np.arange(2, 0, -0.1)
# function to perform the plotting for a given set of labels
def plot_for_labels(labels, ax, title):
    """Plot observed and shuffled-label entropy curves for all 23 linkages.

    For every linkage in the module-level ``list_linkages`` and every threshold
    in the module-level ``distances``, the tree is cut with ``fcluster`` and the
    weighted average entropy of ``labels`` is computed, together with the mean
    of the same quantity over 100 random shuffles of the labels.

    Parameters
    ----------
    labels : list
        Label of every item; it is copied before each shuffle.
    ax : matplotlib axes
        Axes that receives one curve per chromosome plus the "Random" curve.
    title : str
        Title set on ``ax``.

    Returns
    -------
    tuple of (list, list)
        ``(all_wa, labels_true_wa)``: for each chromosome, the list of mean
        shuffled-label entropies and the list of observed entropies, one value
        per threshold.
    """
    shuffled_by_chr = []
    observed_by_chr = []
    for idx in range(23):
        linkage_matrix = list_linkages[idx]
        shuffled_means = []
        observed = []
        for threshold in distances:
            flat = fcluster(linkage_matrix, threshold, criterion='distance')
            observed.append(weighted_average_entropy(labels, flat))
            # Baseline: the same clustering scored against shuffled labels.
            null_scores = []
            for _ in range(100):
                permuted = labels.copy()
                random.shuffle(permuted)
                null_scores.append(weighted_average_entropy(permuted, flat))
            shuffled_means.append(np.mean(null_scores))
        shuffled_by_chr.append(shuffled_means)
        line_color = colors[idx]
        observed_by_chr.append(observed)
        # The 23rd linkage belongs to chromosome X.
        curve_label = "Chr X" if idx == 22 else f"Chr {idx + 1}"
        ax.plot(distances, observed, '-o', label=curve_label, color=line_color)
    # Summarise the shuffled baseline across chromosomes: mean +/- one std.
    baseline_mean = np.mean(shuffled_by_chr, axis=0)
    baseline_var = np.var(shuffled_by_chr, axis=0)
    ax.errorbar(distances, baseline_mean, yerr=np.sqrt(baseline_var), fmt='-o', label="Random", color="black")
    ax.set_title(title)
    ax.set_xlabel('Distance: (1 - correlation coefficient)')
    ax.set_ylabel('Weighted average of the normalized entropy')
    ax.grid(True)
    return shuffled_by_chr, observed_by_chr
#fig, axes = plt.subplots(2, 2, figsize=(24, 24))
# plot_results = {}
# # Plot for each label type and store the results
# plot_results['Activity'] = plot_for_labels(labels_activity, axes[0, 0], 'Activity: Weighted average entropy vs. Distance')
# plot_results['Factor'] = plot_for_labels(labels_factor, axes[0, 1], 'Factor: Weighted average entropy vs. Distance')
# plot_results['Modification'] = plot_for_labels(labels_mark, axes[1, 0], 'Modification: Weighted average entropy vs. Distance')
# plot_results['Cell'] = plot_for_labels(labels_cell, axes[1, 1], 'Cell: Weighted average entropy vs. Distance')
# Create a common legend
# handles, labels = axes[0, 0].get_legend_handles_labels()
# fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, 0.0), ncol=8)
# plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# plt.show()
# plt.savefig("entropy_paper_plot_hg38.eps", format='eps')
# with open('/Users/akim/Documents/epi/plot_results.json', 'w') as file:
# json.dump(plot_results, file)
# Uncomment the block above to regenerate plot_results from scratch (this can
# take several hours); otherwise the previously saved results are loaded from
# ./data/plot_results_38.json below.
import json  # required by json.load below; absent from the imports at the top of the file

# Load the precomputed entropy curves instead of recomputing them.
# NOTE(review): each entry is assumed to be an [all_wa, labels_true_wa] pair as
# returned by plot_for_labels - confirm against the saved file.
with open('./data/plot_results_38.json', 'r') as file:
    plot_results = json.load(file)

# The loaded results are plotted as they are. plot_for_labels is deliberately
# NOT called here: doing so would repeat the hours-long computation, overwrite
# the loaded values, and it needs the axes that are only created below.

# Define distances (in our case from 0 to 2 for 1 - correlation)
colors = list(cm.tab20(np.linspace(0, 1, 20))) + [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
distances = np.arange(2, 0, -0.1)
fig, axes = plt.subplots(2, 2, figsize=(30, 30))
font_size = 22
for idx, ((label, (all_wa, labels_true_wa)), ax) in enumerate(
        zip(plot_results.items(), axes.flatten())):
    # One coloured curve per chromosome with the observed entropy.
    for i, true_wa in enumerate(labels_true_wa):
        ax.plot(distances, true_wa, '-o', color=colors[i])
    # Plotting the random values with error bars (black curves)
    mean_random_wa = np.mean(all_wa, axis=0)
    std_random_wa = np.std(all_wa, axis=0)
    ax.errorbar(distances, mean_random_wa, yerr=std_random_wa, fmt='-o', color="black")
    ax.set_title(label, fontsize=30)
    ax.grid(True)
    if idx >= 2:  # Bottom plots
        ax.set_xlabel('Distance', fontsize=font_size)
    if idx % 2 == 0:  # Left plots
        ax.set_ylabel('Entropy', fontsize=font_size)
    ax.tick_params(axis='both', which='major', labelsize=20)
# Shared legend: one patch per chromosome (the 23rd is X) plus the baseline.
legend_handles = [mpatches.Patch(color=colors[i], label=f'Chr {i+1}' if i != 22 else 'Chr X') for i in range(23)]
legend_handles.append(mpatches.Patch(color="black", label="Random"))
fig.legend(handles=legend_handles, loc='lower center', bbox_to_anchor=(0.5, 0.0), ncol=12, fontsize=font_size)
plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.savefig("./results38/entropy_paper_plot_hg38.eps", format='eps')