I'm builidng a k_means algorithm from scratch without the use of ML libraries. The first step of the k_means algorithm is to initialize k random centroids by assigning k random labels to each data point. How do I assign random cluster labels to each entry in a dataset?
DEFAULT_CENTROIDS = np.array([[5.664705882352942, 3.0352941176470587, 3.3352941176470585, 1.0176470588235293],
[5.446153846153847, 3.2538461538461543, 2.9538461538461536, 0.8846153846153846],
[5.906666666666667, 2.933333333333333, 4.1000000000000005, 1.3866666666666667],
[5.992307692307692, 3.0230769230769234, 4.076923076923077, 1.3461538461538463],
[5.747619047619048, 3.0714285714285716, 3.6238095238095243, 1.1380952380952383],
[6.161538461538462, 3.030769230769231, 4.484615384615385, 1.5307692307692309],
[6.294117647058823, 2.9764705882352938, 4.494117647058823, 1.4],
[5.853846153846154, 3.215384615384615, 3.730769230769231, 1.2076923076923078],
[5.52857142857143, 3.142857142857143, 3.107142857142857, 1.007142857142857],
[5.828571428571429, 2.9357142857142855, 3.664285714285714, 1.1]])
def get_closest(data_point: np.ndarray, centroids: np.ndarray):
"""
Takes a data_point and a nd.array of multiple centroids and returns the index of the centroid closest to data_point
by computing the euclidean distance for each centroid and picking the closest.
"""
N = centroids.shape[0]
dist = np.empty(N)
for i, c in enumerate(centroids):
dist[i] = np.linalg.norm(c - data_point)
index_min = np.argmin(dist)
return index_min
def k_means(data: np.ndarray, k:int=3, n_iter:int=500, random_initialization=False) -> Tuple[np.ndarray, int]:
"""
:param data: your data, a numpy array with shape (n_entries, n_features)
:param k: The number of clusters to compute
:param n_iter: The maximal numnber of iterations
:param random_initialization: If False, DEFAULT_CENTROIDS are used as the centroids of the first iteration.
:return: A tuple (cluster_indices: A numpy array of cluster_indices,
n_iterations: the number of iterations it took until the algorithm terminated)
"""
# Initialize the algorithm by assigning random cluster labels to each entry in your dataset
centroids = DEFAULT_CENTROIDS[random.sample(range(len(DEFAULT_CENTROIDS)), k)]
labels = np.array([np.argmin([(el - c) ** 2 for c in centroids]) for el in data])
clustering = []
for k in range(k):
clustering.append(data[labels == k])
# Implement K-Means with a while loop, which terminates either if the centroids don't move anymore, or
# if the number of iterations exceeds n_iter
counter = 0
while counter < n_iter:
# Compute the new centroids, if random_initialization is false use DEFAULT_CENTROIDS in the first iteration
# if you use DEFAULT_CENTROIDS, make sure to only pick the k first entries from them.
# Update the cluster labels using get_closest
# if the centroids didn't move, exit the while loop
pass
# return the final cluster labels and the number of iterations it took
return clustering, counter
Essentially, what does clustering
get set to without additional libraries? I tried using clustering = (DEFAULT_CENTROIDS.sample(n=3))
but that obviously is not a solution.
EDIT1:
while counter < n_iter:
# Compute the new centroids, if random_initialization is false use DEFAULT_CENTROIDS in the first iteration
# if you use DEFAULT_CENTROIDS, make sure to only pick the k first entries from them.
if random_initialization is False and counter == 0:
centroids = DEFAULT_CENTROIDS[random.sample(range(len(DEFAULT_CENTROIDS)), k)]
# Update the cluster labels using get_closest
labels = np.array([get_closest(el, centroids) for el in data_np])
clustering = []
for i in range(k):
clustering.append(np.where(labels == i)[0])
counter = 1
new_centroids = np.zeros_like(centroids)
for i in range(k):
if len(clustering[i]) > 0:
new_centroids[i] = data_np[clustering[i]].mean(axis=0)
else:
new_centroids[i] = (centroids[i])
# if the centroids didn't move, exit the while loo
if clustering is not None and (centroids != new_centroids).sum() == 0:
break
else:
centroids = new_centroids
pass
# return the final cluster labels and the number of iterations it took
return clustering, counter
CodePudding user response:
At first, I'd create a bi-dimensional array, which will contain the clusters computed at every iteration. You also need to maintain another array that contains the centroid.
# Select k centroids from the default one
centroids = DEFAULT_CENTROIDS[random.sample(range(len(DEFAULT_CENTROIDS)), k)]
# Select all the point nearest to every centroid
labels = np.array([np.argmin([np.sum((el - c) ** 2) for c in centroids]) for el in data])
# Create a list, which will contains k sub-list, defining a cluster for every centroid
clustering = []
for i in range(k):
clustering.append(data[labels == i])
Now you need to update the centroids, and repeat this set of operations until you reach your criteria.
EDIT:
We can adapt this to your code using something like:
n_iter = 500
random_initialization = True
counter = 0
clustering = None
while counter < n_iter:
# Compute the new centroids, if random_initialization is false use DEFAULT_CENTROIDS in the first iteration
# if you use DEFAULT_CENTROIDS, make sure to only pick the k first entries from them.
if random_initialization is False and counter == 0:
centroids = DEFAULT_CENTROIDS[random.sample(range(len(DEFAULT_CENTROIDS)), k)]
# Update the cluster labels using get_closest
labels = np.array([get_closest(el, centroids) for el in data])
clustering = []
for i in range(k):
clustering.append(np.where(labels == i)[0])
counter = 1
new_centroids = np.zeros_like(centroids)
for i in range(k):
if len(clustering[i]) > 0:
new_centroids[i] = data[clustering[i]].mean(axis=0)
else:
new_centroids[i] = (centroids[i])
# if the centroids didn't move, exit the while loop
if clustering is not None and (centroids == new_centroids).sum() == 0:
break
else:
centroids = new_centroids