Hej!
This is my first time working with k-means, TF-IDF and document clustering. I cluster text files with k-means on TF-IDF vectors, which works great. I plot the result (after PCA) and can see the clusters nicely.
But now I want to color the points by the author of each text rather than by cluster/topic. Does anyone know how to do that?
# Collect every .txt file under ./myFiles (relative to the current working
# directory). The list is sorted because glob.glob returns matches in
# arbitrary order; a stable order keeps `dataset[i]` tied to `file_list[i]`
# across runs.
file_list = sorted(glob.glob(os.path.join(os.getcwd(), "myFiles", "*.txt")))

# Read each file fully into memory; dataset[i] is the text of file_list[i].
dataset = []
for file_path in file_list:
    # NOTE(review): assumes the files are UTF-8 encoded -- adjust the
    # encoding here if they are not.
    with open(file_path, encoding="utf-8") as f_input:
        dataset.append(f_input.read())
# Estimators: TF-IDF weighting with English stop words, and a 2-component
# PCA used only to get x/y coordinates for plotting.
vectorizer = TfidfVectorizer(stop_words='english')
pca = PCA(n_components=2)

# One TF-IDF row per document in `dataset`.
vectorized_documents = vectorizer.fit_transform(dataset)

# The TF-IDF matrix is converted with .toarray() before being handed to PCA.
reduced_data = pca.fit_transform(
    vectorized_documents.toarray()
)
# Number of clusters k-means is asked to find.
num_clusters = 7

# Cluster the full TF-IDF vectors (not the 2-D PCA projection).
# random_state is fixed so repeated runs use the same seed.
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=5,
    max_iter=500,
    random_state=42,
).fit(vectorized_documents)
# Results table: one row per document, holding its text and the cluster
# label k-means assigned to it.
results = pd.DataFrame(
    {
        'document': dataset,
        'cluster': kmeans.labels_,
    }
)
# plot the results: one scatter series per cluster, drawn in the 2-D PCA space
colors = ['black', 'red', 'green', 'yellow', 'blue', 'orange', 'purple']
# Legend labels are derived from num_clusters so they cannot fall out of
# sync with it.
cluster = [str(i) for i in range(num_clusters)]

# NOTE: to colour by another per-document attribute (e.g. author) instead of
# by cluster, build a sequence of that attribute in the same order as
# `dataset` and group on it here in place of `kmeans.labels_`. How the author
# of each file is obtained is not shown in this script.
for i in range(num_clusters):
    # Boolean mask selecting the documents assigned to cluster i.
    in_cluster = kmeans.labels_ == i
    plt.scatter(reduced_data[in_cluster, 0],
                reduced_data[in_cluster, 1],
                s=10,
                # Wrap around the palette so that more clusters than colours
                # reuses colours instead of raising IndexError.
                color=colors[i % len(colors)],
                label=f' {cluster[i]}')
plt.legend()
plt.show()

