My idea is to calculate the average distance between the current embedding and the embeddings in every cluster. The cluster with the largest similarity is considered the nearest one (nearest_cluster_idx), and its distance is nearest_cluster_avg_distance. I need help choosing the right threshold value.
Many Thanks :)
def cluster_sentences(self, *, base_threshold=0.25, threshold_factor=0.25):
    """Greedily group sentence indices by average cosine distance.

    Items are visited in the order of ``self.embeddings``. For each item the
    average distance ``1 - mean(similarity)`` to the members of every existing
    cluster is computed. The item joins the nearest cluster when that distance
    is below an adaptive threshold, otherwise it starts a new cluster.

    The adaptive threshold is::

        base_threshold + threshold_factor * intra_cluster_distance

    where ``intra_cluster_distance`` is the mean pairwise distance between
    the distinct members of the nearest cluster (0.0 for a single-member
    cluster). Looser clusters therefore accept slightly more distant items.

    NOTE(review): assumes ``self.cosine_similarity_standard`` is a square
    NumPy-style matrix of pairwise cosine similarities in [-1, 1], aligned
    with ``self.embeddings`` -- confirm against the code that builds it.
    Under that assumption distances lie in [0, 2], so a negative threshold
    can never be met and every item would become its own cluster.

    Args:
        base_threshold: Maximum average distance for joining a cluster whose
            members are identical (or a single-member cluster).
        threshold_factor: How much the nearest cluster's own spread widens
            the threshold. Use 0.0 for a fixed threshold.

    Returns:
        A list of clusters, each a list of indices into ``self.embeddings``.
    """
    similarity = self.cosine_similarity_standard
    clusters = []

    for i, _ in enumerate(self.embeddings):
        # Find the existing cluster with the smallest average distance to i.
        nearest_idx = None
        nearest_distance = float('inf')
        for cluster_idx, members in enumerate(clusters):
            distance = 1 - similarity[i, members].mean()
            if distance < nearest_distance:
                nearest_distance = distance
                nearest_idx = cluster_idx

        # No candidate: either there are no clusters yet, or no distance was
        # comparable (e.g. NaN similarities). Start a new cluster.
        if nearest_idx is None:
            clusters.append([i])
            continue

        members = clusters[nearest_idx]
        size = len(members)
        if size > 1:
            # np.ix_ selects the full members-by-members block. Indexing with
            # paired arrays (m[idx, idx]) would return only the diagonal,
            # i.e. each member's similarity with itself.
            block = similarity[np.ix_(members, members)]
            # Average over ordered pairs of distinct members only.
            pair_similarity = (block.sum() - np.trace(block)) / (size * (size - 1))
            intra_cluster_distance = 1 - pair_similarity
        else:
            intra_cluster_distance = 0.0

        adaptive_threshold = base_threshold + threshold_factor * intra_cluster_distance
        if nearest_distance < adaptive_threshold:
            members.append(i)
        else:
            clusters.append([i])

    return clusters