-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.py
538 lines (427 loc) · 19.6 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
#!/usr/bin/env python3
"""
Module: clustering.py
Clustering for convolutional autoencoder :
* Iterate through the whole dataset(both training and validation) passing them
through only encoder to get bottleneck vectors for each data point.
* Cluster these bottleneck vectors using unsupervised clustering methods like
Kmeans or GaussianMixture modeling.
Online resources we find useful during implementation
Methods for selecting the optimal number of clusters: https://towardsdatascience.com/cheat-sheet-to-implementing-7-methods-for-selecting-optimal-number-of-clusters-in-python-898241e1d6ad
Gap statistics intro: https://glowingpython.blogspot.com/2019/01/a-visual-introduction-to-gap-statistics.html
KMeans, elbow and Silhouette: https://realpython.com/k-means-clustering-python/
Authors: Krupal, Rachael
"""
import os
from pathlib import Path
from math import ceil, floor
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
#local system hangs while processing GMM so commented out for now
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from kneed import KneeLocator # to identify the elbow point programmatically
import argparse
import torch
import torch.nn as nn
from torchvision.utils import save_image, make_grid
from models.residual_encoder import DefaultEncoderOpts
from models.residual_encoder import ResidualEncoder as Encoder
from models.residual_decoder import ResidualDecoder as Decoder
#from models.residual_encoder import DefaultEncoderOpts, ResidualEncoder as Encoder
from collections import OrderedDict
from data.audiodataset import DefaultSpecDatasetOps, StridedAudioDataset, get_audio_files_from_dir, Dataset
from utils.logging import Logger
parser = argparse.ArgumentParser()
parser.add_argument(
"-d",
"--debug",
dest="debug",
action="store_true",
help="Log additional training and model information.",
)
""" Directory parameters """
parser.add_argument(
"--model_path",
type=str,
default=None,
help="Path to a trained model.",
)
parser.add_argument(
"--log_dir", type=str, default=None, help="The directory to store the logs."
)
parser.add_argument(
"--clustering_dir",
type=str,
help="The directory where the clustering outputs will be stored.",
)
parser.add_argument(
"--data_dir", # '/net/projects/scratch/winter/valid_until_31_July_2021/0-animal-communication/data_grid/Chimp_IvoryCoast/xxx'
type=str,
help="The path to the dataset directory.",
)
parser.add_argument(
"--decod_dir",
type=str,
default=None,
help="The directory to store the regenerated/decoded spectrograms.",
)
""" Clustering parameters """
parser.add_argument(
"--calc_optimal_num_clusters", type=str, default=None, help="use elbow or gap statistics to find the optimal number of clusters",
)
parser.add_argument(
"--max_clusters", type=int, default=None, help="The max number of clusters to try for calculating optimal number of clusters."
)
parser.add_argument(
"--num_clusters", type=int, default=None, help="The number of clusters to cluster all the data points."
)
parser.add_argument(
"--clustering_algorithm",
type=str,
default=None,
help="Use either KMeans or GMM to cluster the embeddings",
)
""" Data input parameters """
parser.add_argument(
"--sequence_len", type=float, default=2, help="Sequence length in [s]."
)
parser.add_argument(
"--hop", type=float, default=1, help="Hop [s] of subsequent sequences."
)
parser.add_argument(
"--batch_size", type=int, default=1, help="The number of images per batch."
)
parser.add_argument(
"--num_workers", type=int, default=4, help="Number of workers used in data-loading"
)
parser.add_argument(
"--no_cuda",
dest="cuda",
action="store_false",
help="Do not use cuda to train model.",
)
ARGS = parser.parse_args()
ARGS.cuda = torch.cuda.is_available() and ARGS.cuda # check if both hardware and user's intention of using cuda
ARGS.device = torch.device("cuda") if ARGS.cuda else torch.device("cpu")
log = Logger("Clustering", ARGS.debug, ARGS.log_dir)
kmeans_kwargs = {
"init": "k-means++",
"n_init": 10, # sets the number of initializations to perform
"max_iter": 300,
"random_state": 42,
}
"""
Get audio all audio files from the given data directory except they are broken.
discard the can_load_from_csv() of orcaspot
"""
def get_audio_files():
audio_files = list(get_audio_files_from_dir(ARGS.data_dir))
log.info("Found {} audio files for training.".format(len(audio_files)))
if len(audio_files) == 0:
log.close()
exit(1)
return audio_files
def save_decod_spec(spec, epoch):
#if not os.path.isdir(ARGS.decod_dir):
# os.makedirs(ARGS.decod_dir, exist_ok=True)
spec = spec.view(spec.size(0), 1, 128, 256)
save_image(spec, os.path.join(ARGS.decod_dir, 'reconstructed_epoch_{}.png'.format(epoch)))
def kmeans_optimalK(data, max_clusters=30):
kmeans = [KMeans(
n_clusters=i,
**kmeans_kwargs,
) for i in range(1, max_clusters+1)]
print("kmeans is ", kmeans)
scores = [kmeans[i].fit(data).inertia_ for i in range(len(kmeans))]
print("scores are ", scores)
#x = range(1, max_clusters+1)
#calculation = np.abs(np.diff(x).mean())
#print(x)
#print(calculation)
#suggested_elbow = get_elbow(np.arange(1, max_clusters+1), scores)
kl = KneeLocator(np.arange(1, max_clusters+1), scores)
suggested_elbow = kl.elbow
return scores, suggested_elbow
def get_elbow(max_clusters, scores, curve="convex", direction='decreasing'): #concave
"""Helper function to identify the elbow point programmatically"""
kl = KneeLocator(
max_clusters, scores, curve, direction
)
return kl.elbow
def elbow_plot(max_clusters, scores, path):
plt.plot(np.arange(1, max_clusters+1), scores, linestyle='--', marker='o', color='b')
plt.xlabel('Number of clusters')
plt.ylabel('Score (negative of the K-means objective)')
plt.title('Elbow curve with max_clusters={}'.format(max_clusters))
plt.savefig(os.path.join(path, 'ElbowCurve.png'))
log.info("The elbow plot is saved under directory {}".format(path))
def gap_optimalK(data, num_refs=4, max_clusters=15):
"""
Calculate kMeans optimal number of clusters using Gap statistics
adapted from
https://datasciencelab.wordpress.com/tag/gap-statistic/
https://towardsdatascience.com/cheat-sheet-to-implementing-7-methods-for-selecting-optimal-number-of-clusters-in-python-898241e1d6ad
and https://glowingpython.blogspot.com/2019/01/a-visual-introduction-to-gap-statistics.html
:param data: (n_samples, n_features)
:param num_refs: number of sample reference datasets to create
:param maxClusters: Maximum number of clusters to test for
:return: (gaps, optimalK)
"""
#if len(data.shape) == 1:
#data = data.reshape(-1, 1)
gaps = np.zeros((len(range(1, max_clusters+1)),))
resultsdf = pd.DataFrame({'clusterCount': [], 'gap': []})
for gap_index, k in enumerate(range(1, max_clusters+1)):
# Holder for reference dispersion results
ref_disps = np.zeros(num_refs)
# For n references, generate random sample and perform kmeans getting resulting dispersion of each loop
for i in range(num_refs):
# Create new random reference set
random_reference = np.random.random_sample(size=data.shape)
# Fit to it
km = KMeans(n_clusters=k, **kmeans_kwargs)
km.fit(random_reference)
ref_disp = km.inertia_
ref_disps[i] = ref_disp
# Fit cluster to original data and create dispersion
km = KMeans(n_clusters=k, **kmeans_kwargs)
km.fit(data)
orig_disp = km.inertia_
# Calculate gap statistic
# online resources use: gap = np.log(np.mean(refDisps)) - np.log(origDisp). But it is believed to be wrong
# because of the equation 3 in the original paper
gap = np.mean(np.log(ref_disps)) - np.log(orig_disp)
# Assign this loop's gap statistic to gaps
gaps[gap_index] = gap
resultsdf = resultsdf.append({'clusterCount': k, 'gap': gap}, ignore_index=True)
return (gaps.argmax() + 1, resultsdf)
def gap_plot(df, path):
print('clusterCount is ', df['clusterCount'])
plt.plot(df['clusterCount'], df['gap'], linestyle='--', marker='o', color='b');
plt.xlabel('K');
plt.ylabel('Gap Statistic');
plt.title('Gap Statistic vs. K');
plt.savefig(os.path.join(path, 'GapStat.png'));
log.info("The gap plot is saved under directory {}".format(path))
if __name__ == '__main__':
audio_files = get_audio_files()
#Path(ARGS.decod_dir).mkdir(parents=True, exist_ok=True)
if ARGS.decod_dir is not None and os.path.isdir(ARGS.decod_dir):
os.makedirs(ARGS.decod_dir, exist_ok=True)
# load the trained model
if ARGS.model_path is not None:
model_dict = torch.load(ARGS.model_path)
encoder = Encoder(model_dict["encoderOpts"]).to(ARGS.device)
encoder.load_state_dict(model_dict["encoderState"])
decoder = Decoder(model_dict["decoderOpts"]).to(ARGS.device)
decoder.load_state_dict(model_dict["decoderState"])
#model = encoder
model = nn.Sequential(
OrderedDict([("encoder", encoder), ("decoder", decoder)])
)
dataOpts = model_dict["dataOpts"]
log.info(model)
if torch.cuda.is_available() and ARGS.cuda:
model = model.cuda()
model.eval()
sr = dataOpts["sr"] # modified, s.t. not hard-coded
hop_length = dataOpts["hop_length"]
n_fft = dataOpts["n_fft"]
try:
n_freq_bins = dataOpts["num_mels"]
except KeyError:
n_freq_bins = dataOpts["n_freq_bins"]
freq_compression = dataOpts["freq_compression"] # added, missing in orig master; is freq compression not needed during inference?
fmin = dataOpts["fmin"]
fmax = dataOpts["fmax"]
log.debug("dataOpts: " + str(dataOpts))
#sequence_len = int(ceil(ARGS.sequence_len * sr))
sequence_len = int(
float(ARGS.sequence_len) / 1000 * dataOpts["sr"] / dataOpts["hop_length"]
)
hop = int(ceil(ARGS.hop * sr))
log.info("Predicting {} files".format(len(audio_files)))
#for file_name in audio_files:
# log.info(file_name)
# dataset = StridedAudioDataset(
# os.path.join(ARGS.data_dir, file_name.strip()),
# sequence_len=sequence_len,
# hop=hop,
# sr=sr,
# fft_size=n_fft,
# fft_hop=hop_length,
# n_freq_bins=n_freq_bins,
# freq_compression=freq_compression, # added
# f_min=fmin,
# f_max=fmax,
# )
# data_loader = torch.utils.data.DataLoader(
# dataset,
# batch_size=ARGS.batch_size,
# num_workers=ARGS.num_workers,
# pin_memory=True,
# )
# log.info("size of the file(samples)={}".format(dataset.n_frames))
# log.info("size of hop(samples)={}".format(hop))
# stop = int(max(floor(dataset.n_frames / hop), 1))
# log.info("stop time={}".format(stop))
dataset = Dataset(
file_names=audio_files,
working_dir=ARGS.data_dir,
#cache_dir=ARGS.cache_dir,
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
n_freq_bins=n_freq_bins,
freq_compression=freq_compression,
f_min=fmin,
f_max=fmax,
seq_len=sequence_len,
augmentation=None,
noise_files=None,
)
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=ARGS.batch_size, # default: 1
num_workers=ARGS.num_workers,
pin_memory=True,
)
#log.info("size of the file(samples)={}".format(dataset.n_frames))
#log.info("size of hop(samples)={}".format(hop))
#stop = int(max(floor(dataset.n_frames / hop), 1))
#log.info("stop time={}".format(stop))
# collect embeddings for all data points
file_names = []
bottleneck_outputs = []
with torch.no_grad():
for i, (input_specs, label) in enumerate(data_loader):
#print(input_specs)
# remove file path to have only file name, ex : ['path/to/directory/file_1.wav']
file_name = str(label['file_name'])[::-1] # reverse string
file_name = file_name.split("/")[0]
file_name = file_name[::-1].split("'")[0] # align it back to the right order
file_names.append(file_name)
# log.info("File-name : {}".format(file_name))
input_specs = input_specs.to(device=ARGS.device)
bottleneck_output = model.encoder(input_specs)
bottleneck_output = bottleneck_output.cpu()
#print("bottleneck_output .shape : ", bottleneck_output.shape)
#print("bottleneck_output[0].shape :", bottleneck_output[0].shape)
# uncomment if wanting to visualize bottleneck features
#img = bottleneck_output[0]
#save_image(img, os.path.join(ARGS.decod_dir, 'encoder_bottleneck_{}.png'.format(i)))
bottleneck_output = np.reshape(bottleneck_output.detach().numpy(), newshape=(-1))
# bottleneck_output = torch.flatten(bottleneck_output)
# bottleneck_output = bottleneck_output.detach().numpy()
bottleneck_outputs.append(bottleneck_output)
log.debug("Finished extract code")
log.debug(kmeans_kwargs)
# calculate the optimal number of clusters using elbow or gap statistics
if ARGS.calc_optimal_num_clusters is not None:
if ARGS.calc_optimal_num_clusters == 'elbow':
""" Finding optimal number of clusters, by using elbow curve (below are default values)"""
scores, suggested_elbow = kmeans_optimalK(bottleneck_outputs, max_clusters=ARGS.max_clusters)
log.info("The suggested optimal K is {}".format(suggested_elbow))
elbow_plot(ARGS.max_clusters, scores, ARGS.clustering_dir)
elif ARGS.calc_optimal_num_clusters == 'gap':
score_g, df = gap_optimalK(np.array(bottleneck_outputs), num_refs=5, max_clusters=ARGS.max_clusters)
gap_plot(df, ARGS.clustering_dir)
log.info("The suggested optimal K is {}".format(score_g))
# if you already know how many clusters you would like to have,
# you could directly train the clustering model and cluster the embeddings
if ARGS.num_clusters is not None and ARGS.clustering_algorithm is not None:
if ARGS.clustering_algorithm == "kmeans":
kmeans = KMeans(n_clusters=ARGS.num_clusters, **kmeans_kwargs)
pred_kmeans = kmeans.fit_predict(bottleneck_outputs)
log.info("predictions : {}".format(pred_kmeans))
print("Cluster centers of Kmeans : ", kmeans.cluster_centers_)
df = pd.DataFrame(columns=["filename"] + ["cluster_number"])
# print file names with respective cluster numbers
for i in range(len(data_loader)):
log.info("file name : {}, predicted cluster - Kmeans : {}".format(file_names[i], pred_kmeans[i]))
# log.info("file name : {}, predicted cluster - GaussianMixture : {}".format(file_names[i], pred_gm[i]))
df = df.append(dict(zip(df.columns, [file_names[i]] + [pred_kmeans[i]])), ignore_index=True)
if ARGS.clustering_dir is not None:
if Path(ARGS.clustering_dir).exists() == False:
os.mkdir(ARGS.clustering_dir)
df.to_csv(ARGS.clustering_dir + "/Kmeans_clusters.csv")
log.info("kmeans_clusters csv is saved under directory {}".format(ARGS.clustering_dir))
elif ARGS.clustering_algorithm == "gmm":
gm = GaussianMixture(n_components=2, random_state=0)
pred_gm = gm.fit_predict(bottleneck_outputs)
log.info("predictions : {}".format(pred_gm))
#print("Cluster centers of GaussianMixture : {:.8f}".format(gm.means_))
df = pd.DataFrame(columns=["filename"] + ["cluster_number"])
# print file names with respective cluster numbers
for i in range(len(data_loader)):
log.info("file name : {}, predicted cluster - GaussianMixture : {}".format(file_names[i], pred_gm[i]))
df = df.append(dict(zip(df.columns, [file_names[i]] + [pred_gm[i]])), ignore_index=True)
if ARGS.clustering_dir is not None:
if Path(ARGS.clustering_dir).exists() == False:
os.mkdir(ARGS.clustering_dir)
df.to_csv(ARGS.clustering_dir + "/gmm_clusters.csv")
log.info("gmm_clusters csv is saved under directory {}".format(ARGS.clustering_dir))
else:
log.error("Pls choose a clustering algorithm - kmeans or gmm (in a case sensitive manner)")
#summary_dir = ARGS.clustering_dir
#if summary_dir is not None:
#df.to_csv(summary_dir + "/Kmeans_clusters")
#print("km.cluster_centers_ length :", len(km.cluster_centers_))
"""
if ARGS.decod_dir is not None:
bottleneck_output = 0
with torch.no_grad():
for i in range(len(bottleneck_outputs)):
bottleneck_output = torch.tensor(bottleneck_outputs[i]).to(ARGS.device)
#print("Krupal : bottleneck_output shape :", bottleneck_output.shape)
bottleneck_output = torch.reshape(bottleneck_output, (-1, 4, 4, 8)) #512
#print("bottleneck_output .shape : ", bottleneck_output.shape)
#print("bottleneck_output[0].shape :", bottleneck_output[0].shape)
img = bottleneck_output[0]
save_image(img, os.path.join(ARGS.decod_dir, 'decoder_bottleneck_{}.png'.format(i)))
# bottleneck_output = torch.reshape(bottleneck_output, (1, 512, 4, 8))
# not valid bottleneck_output = torch.unflatten(bottleneck_output, (1, 512, 4, 8))
# bottleneck_output = bottleneck_output.unflatten(-1, (512, 4, 8))
regenerate_spec = model.decoder(bottleneck_output)
# Krupal :
#print("Decoder's regenerated spec from embeddings' shape :", regenerate_spec.shape)
epoch = "regenerated_spec_" + str(i)
save_decod_spec(regenerate_spec.cpu().data, epoch) # change "cpu' to device
log.close()
"""
# -------------------------- leftover from Lucidmonkey --------------------------
# extract features
#df = pd.DataFrame( columns = ["filename"] + ["bottleneck" + str(_) for _ in range(nbottleneck)] )
#for spectrogram in data:
#imagename = spectrogram["name"]
#spectrogram = spectrogram["spectrogram"]
# if the spectrogram is not of width 194 units, don't run the iteration
#if spectrogram.shape[1] != 194:
#continue
#norm = np.linalg.norm(spectrogram)
#snippet = spectrogram / norm
#snippet = torch.reshape( snippet, (-1,) ).to(device)
#features = model.generatefeatures(snippet).detach().numpy()
#features = [str(f.item()) for f in features]
#df = df.append( dict( zip( df.columns, [imagename] + list(features) ) ), ignore_index=True )
#df.to_csv(folder + "/features")
# perform clustering
#clustering = GaussianMixture(n_components=7, random_state=0).fit(df[[ "bottleneck" + str(_) for _ in range(nbottleneck) ]])
#df["clusterlabels"] = clustering.means_
#fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')
#r = [-1,1]
#X, Y = np.meshgrid(r, r)
#ax.scatter3D(points[:, 0], points[:, 1], points[:, 2])
#ax.set_xlabel('X')
#ax.set_ylabel('Y')
#ax.set_zlabel('Z')
#plt.savefig(folder + '/gmmmixtures')
# write to file
#df.to_csv(folder + "/features")
#sys.stdout.close()
#sys.stderr.close()