I implemented a PCA class like this:
import numpy as np
import matplotlib.pyplot as plt
class PCA:
    """From-scratch principal component analysis.

    The principal axes are the eigenvectors of the sample covariance matrix
    of the data passed to ``fit``, ordered by decreasing eigenvalue.

    Attributes set by ``fit`` (all ``None`` until then):
        mean: per-column mean of the training data.
        eigenvalues: every eigenvalue of the covariance matrix, largest first.
        eigenvectors: matching eigenvectors, one per column.
        components: the first ``n_components`` columns of ``eigenvectors``.
        variance_ratio: each eigenvalue divided by the sum of all eigenvalues.
        cumulative_var: running sum of ``variance_ratio``.
    """

    def __init__(self, n_components):
        """Store the number of leading components to keep.

        Args:
            n_components: how many eigenvector columns ``fit`` keeps in
                ``components``.
        """
        print('****** Created from-scratch PCA object *****')
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.eigenvectors = None
        self.eigenvalues = None
        self.variance_ratio = None
        self.cumulative_var = None

    def fit(self, X):
        """Learn the mean and principal axes of ``X``.

        Args:
            X: 2-D data with one row per sample and one column per feature.
        """
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        # np.cov treats rows as variables, hence the transpose.
        covariance = np.cov(X_centered.T)
        # np.linalg.eig is kept deliberately: the sign of each eigenvector is
        # not unique, and code that compares projected values against fixed
        # expected numbers depends on the signs this routine produces.
        self.eigenvalues, self.eigenvectors = np.linalg.eig(covariance)
        # Reorder so the largest eigenvalue (most variance) comes first.
        eigenvalue_idxs = np.argsort(self.eigenvalues)[::-1]
        self.eigenvalues = self.eigenvalues[eigenvalue_idxs]
        self.eigenvectors = self.eigenvectors[:, eigenvalue_idxs]
        self.components = self.eigenvectors[:, :self.n_components]
        # Ratios are taken over ALL eigenvalues, not just the kept ones.
        self.variance_ratio = self.eigenvalues / np.sum(self.eigenvalues)
        self.cumulative_var = np.cumsum(self.variance_ratio)

    def transform(self, X):
        """Project ``X`` onto the kept principal axes.

        Args:
            X: 2-D data with the same number of columns as the data given
                to ``fit``.

        Returns:
            The centered data multiplied by ``components``; one column per
            kept component.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.mean is None or self.components is None:
            raise RuntimeError('PCA.transform() called before PCA.fit()')
        # Center with the mean learned in fit, not the mean of X itself.
        X_centered = X - self.mean
        projected_data = np.dot(X_centered, self.components)
        return projected_data
And this is my code utilizing the class, but it won't pass the doctests:
#Step 1. import libs and dataset
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the breast cancer dataset bundled with scikit-learn and print a quick overview.
breast_cancer = load_breast_cancer()
print(breast_cancer.feature_names)
print(len(breast_cancer.feature_names))
print(breast_cancer.target)
print(breast_cancer.target_names)
# Row 0: the distinct target labels; row 1: how many samples carry each label.
print(np.array(np.unique(breast_cancer.target, return_counts=True)))
# Put the features in a DataFrame and append the target as a 'diagnosis' column.
df = pd.DataFrame(breast_cancer.data, columns = breast_cancer.feature_names)
df['diagnosis'] = breast_cancer.target
# The result is discarded here; it is only displayed in an interactive/notebook session.
df.head()
# Separate the feature matrix from the label column.
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
# Seed reused by the train/test splits below.
random_state = 12
# Baseline 1: logistic regression on the raw features with a 70/30 split.
# The literal 12 here equals random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12, shuffle=True)
log_reg = LogisticRegression(max_iter = 5000)
log_reg.fit(X_train, y_train)
print('Logistic regression model score on original UNSCALED dataset (all features):', log_reg.score(X_test,y_test))
# Baseline 2: the same model on standardized features.
# NOTE(review): the scaler is fit on the full X before splitting, so the test rows
# contribute to the scaling statistics.
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
# y_train / y_test are reassigned by this split.
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = random_state, shuffle = True)
log_reg_scaled = LogisticRegression(max_iter=5000)
log_reg_scaled.fit(X_train_scaled, y_train)
print('Logistic regression model score on original SCALED dataset (all features):', log_reg_scaled.score(X_test_scaled,y_test))
# Fit the from-scratch PCA keeping as many components as there are feature columns.
components = X.shape[1]
pca_all = PCA(n_components = components)
pca_all.fit(X_scaled)
X_pca_all = pca_all.transform(X_scaled)
print('Principal components:', pca_all.components)
print('Variance ratios:', pca_all.variance_ratio)
# Cumulative variance is printed as a percentage.
print('Cumulative Variances:', pca_all.cumulative_var*100)
print('Number of components in scree plot:', pca_all.n_components)
# Scree plot: bars show each component's variance ratio, the line shows the cumulative ratio.
# Component labels 1..n for the x axis.
PC_components = np.arange(1, pca_all.components.shape[1]+1)
sns.set(style='whitegrid', font_scale=1.2)
plt.subplots(figsize=(20, 7))
sns.barplot(x=PC_components, y=pca_all.variance_ratio, color='b')
# NOTE(review): x is shifted by -1, presumably because barplot draws its categorical
# bars at positions 0..n-1 — confirm the line sits on top of the bars.
sns.lineplot(x=PC_components-1, y=pca_all.cumulative_var, color='black', linestyle='-', linewidth=2, marker='o', markersize=8)
plt.title('Scree Plot')
plt.xlabel('N-th Principal Component')
plt.ylabel('Variance Explained')
plt.ylim(0, 1)
plt.show()
# Train and score logistic regression on the full PCA projection (70/30 split, shared seed).
X_train_pca_all, X_test_pca_all, y_train_pca_all, y_test_pca_all = train_test_split(X_pca_all, y, test_size=0.3, random_state=random_state, shuffle=True)
log_reg_pca_all = LogisticRegression(max_iter=5000)
log_reg_pca_all.fit(X_train_pca_all, y_train_pca_all)
print('Logistic regression model score on transformed dataset (all PCs):', log_reg_pca_all.score(X_test_pca_all, y_test_pca_all))
# Repeat the experiment keeping only the first 4 principal components.
pca_4 = PCA(n_components=4)
pca_4.fit(X_scaled)
X_pca_4 = pca_4.transform(X_scaled)
# Split the 4-component projection. Splitting X_pca_all here (as before) trained
# and scored this model on every component, so it reported the all-PC score
# instead of the 4-PC score.
X_train_pca_4, X_test_pca_4, y_train_pca_4, y_test_pca_4 = train_test_split(X_pca_4, y, test_size=0.3, random_state=random_state, shuffle=True)
log_reg_pca_4 = LogisticRegression(max_iter=5000)
log_reg_pca_4.fit(X_train_pca_4, y_train_pca_4)
print('Logistic regression model score on transformed dataset (keep 4 PCs):', log_reg_pca_4.score(X_test_pca_4,y_test_pca_4))
#Run the doctest module. DO NOT modify any code below this line!
import doctest
"""
>>> print(X_pca_all.shape[1])
30
>>> print(np.round(X_pca_all[0][17], 3))
0.55
>>> print(np.round(log_reg_pca_all.score(X_test_pca_all,y_test_pca_all), 3))
0.971
>>> print(X_pca_4.shape[1])
4
>>> print(np.round(X_pca_4[29][3], 3))
1.911
>>> print(np.round(log_reg_pca_4.score(X_test_pca_4,y_test_pca_4), 3))
0.965
"""
doctest.testmod()
When I run it, 4 of the 6 doctests pass. The only ones that fail are the two that check the score of the logistic regression model. What should I do to get the expected doctest results?
**********************************************************************
File "__main__", line 7, in __main__
Failed example:
print(np.round(log_reg_pca_all.score(X_test_pca_all,y_test_pca_all), 3))
Expected:
0.971
Got:
0.982
**********************************************************************
File "__main__", line 13, in __main__
Failed example:
print(np.round(log_reg_pca_4.score(X_test_pca_4,y_test_pca_4), 3))
Expected:
0.965
Got:
0.982
**********************************************************************
1 items had failures:
2 of 6 in __main__
***Test Failed*** 2 failures.
TestResults(failed=2, attempted=6)