from scipy.cluster.hierarchy import dendrogram, linkage,fcluster

import numpy as np

import matplotlib.pyplot as plt

# data

np.random.seed(4711) # for repeatability of this tutorial

a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,])

b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,])

X = np.concatenate((a, b),)

plt.scatter(X[:,0], X[:,1])

# fit clusters

Z = linkage(X, method='ward', metric='euclidean', preserve_input=True)

# plot dendrogram

max_d = 50

clusters = fcluster(Z, max_d, criterion='distance')

# now if I have new data

a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[10,])

b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[5,])

X_test = np.concatenate((a, b),)

print(X_test.shape) # 150 samples with 2 dimensions

plt.scatter(X_test[:,0], X_test[:,1])

plt.show()

how to compute distances for the new data and assign clusters using clusters from training data?

code references:joernhees.de