I am trying to classify data points from this dataset: https://sharon.srworkspace.com/ml/datasets/hw1/wine.data.csv. I am utilizing a Gaussian Bayes and Gaussian Naive Bayes classifiers almost from scratch in Python. So, after the train test split for the model, I implemented these functions to classify the data points:
import numpy as np
from scipy.stats import multivariate_normal
def classify_point_gaussian_bayes(x):
classes = np.unique(y)
likelihoods = []
for c in classes:
class_data = data[y == c]
prior = len(class_data) / len(data)
mean = np.mean(class_data, axis=0)
cov = np.cov(class_data.T)
likelihood = multivariate_normal.pdf(x_reshaped, mean=mean, cov=cov, allow_singular=True)
likelihoods.append(prior * likelihood)
return classes[np.argmax(likelihoods)]
def classify_point_gaussian_naive_bayes(x):
classes = np.unique(y)
likelihoods = []
for c in classes:
class_data = data[y == c]
prior = len(class_data) / len(data)
mean = np.mean(class_data, axis=0)
var = np.var(class_data, axis=0)
likelihood = multivariate_normal.pdf(x_reshaped, mean=mean, cov=np.diag(var), allow_singular=True)
likelihoods.append(prior * likelihood)
return classes[np.argmax(likelihoods)]
And then I have to look at the test accuracies for both methods, which I did in this form:
res = []
for idx, test_point in enumerate(X_test.values):
res.append(classify_point_gaussian_bayes(test_point) == y_test[idx])
print(f'Test accuracy for gaussian bayes is {res.count(True)/len(res)}')
res = []
for idx, test_point in enumerate(X_test.values):
res.append(classify_point_gaussian_naive_bayes(test_point) == y_test[idx])
print(f'Test accuracy for gaussian naive bayes is {res.count(True)/len(res)}')
But I continue to have the same error: ValueError: operands could not be broadcast together with shapes (1,13) (14,).
More specifically:
ValueError Traceback (most recent call last)
Cell In[42], line 3
1 res = []
2 for idx, test_point in enumerate(X_test.values):
----> 3 res.append(classify_point_gaussian_bayes(test_point) == y_test[idx])
4 print(f'Test accuracy for gaussian bayes is {res.count(True)/len(res)}')
6 res = []
Cell In[41], line 21
18 # Reshape x to have the same number of features as the mean
19 x_reshaped = x.reshape(1, -1)
---> 21 likelihood = multivariate_normal.pdf(x_reshaped, mean=mean, cov=cov, allow_singular=True)
22 likelihoods.append(prior * likelihood)
24 return classes[np.argmax(likelihoods)]
File c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_multivariate.py:583, in multivariate_normal_gen.pdf(self, x, mean, cov, allow_singular)
581 dim, mean, cov_object = params
582 x = self._process_quantiles(x, dim)
--> 583 out = np.exp(self._logpdf(x, mean, cov_object))
584 if np.any((cov_object.rank < dim)):
585 out_of_bounds = ~cov_object._support_mask(x-mean)
File c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_multivariate.py:526, in multivariate_normal_gen._logpdf(self, x, mean, cov_object)
507 """Log of the multivariate normal probability density function.
508
509 Parameters
(...)
523
524 """
525 log_det_cov, rank = cov_object.log_pdet, cov_object.rank
--> 526 dev = x - mean
527 if dev.ndim > 1:
528 log_det_cov = log_det_cov[..., np.newaxis]
ValueError: operands could not be broadcast together with shapes (1,13) (14,)
Since it is a problem about dimensions, I tried to resize the x data points that the functions take as argument with this line in both functions: x_reshaped = x.reshape(1, -1) and even: x_reshaped = x.reshape(-1).
But it did not work and still gives me the same error as above.