I'm in the process of data splitting and cross-validation. For the data splitting, I need to extract ONLY the test dataset and leave the rest of the data as is for cross-validation. However, I'm getting the error `ValueError: could not convert string to float: 'Curtis RIngraham Directge'` at the end of the cross-validation step. How should I fix it?
Data Splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import pandas as pd

# scikit-learn estimators need purely numeric input: any string-valued
# (categorical) column left in the features makes model.fit() raise
# "ValueError: could not convert string to float: ...". One-hot encode those
# columns before turning the frame into an array; numeric columns are left
# as they are by get_dummies.
# NOTE(review): free-text / identifier-like columns (e.g. a person's name)
# expand into one column per unique value and usually carry no useful
# signal -- consider dropping them from features_df instead of encoding them.
features = pd.get_dummies(features_df, dtype=float).to_numpy()
labels = labels_df.to_numpy()

# First extract our test data and store it in x_test, y_test; the remaining
# 90% (_x, _y) is left as is and only used for cross validation.
_x, x_test, _y, y_test = train_test_split(
    features, labels, test_size=0.10, random_state=42
)

# set k = 5
k = 5
kfold_spliiter = KFold(n_splits=k)

# Store every fold up front so the cross-validation step can reuse the exact
# same splits for each model (this is an inefficient way but still do it).
folds_data = []
for fold, (train_index, validation_index) in enumerate(
    kfold_spliiter.split(_x), start=1
):
    # Index along the first axis only, so this works whether the label array
    # is 1-D (from a Series) or 2-D (from a DataFrame).
    x_train, x_valid = _x[train_index], _x[validation_index]
    y_train, y_valid = _y[train_index], _y[validation_index]
    print (f"Fold {fold} training data shape = {(x_train.shape,y_train.shape)}")
    print (f"Fold {fold} validation data shape = {(x_valid.shape,y_valid.shape)}")
    folds_data.append((x_train, y_train, x_valid, y_valid))
Cross Validation
# Cross validation: fit every candidate model on each stored fold, average
# its training and validation accuracy across the folds, and keep the model
# with the highest average validation accuracy.
best_validation_accuracy = 0
best_model_name = ""
best_model = None

# Iterate over all models
for model_name, model in all_models.items():
    print (f"Evaluating {model_name} ...")

    # Let's store training and validation accuracies for all folds
    train_acc_for_all_folds = []
    valid_acc_for_all_folds = []

    # Iterate over all folds
    for x_train, y_train, x_valid, y_valid in folds_data:
        # Flatten the labels once so that fitting and both accuracy
        # computations all see the same 1-D label vectors.
        y_train_flat = y_train.ravel()
        y_valid_flat = y_valid.ravel()

        # Train the model
        model.fit(x_train, y_train_flat)

        # Evaluate the model on training and on validation data
        y_pred_train = model.predict(x_train)
        y_pred_valid = model.predict(x_valid)

        # accuracy_score takes (y_true, y_pred) in that order
        train_acc_for_all_folds.append(accuracy_score(y_train_flat, y_pred_train))
        valid_acc_for_all_folds.append(accuracy_score(y_valid_flat, y_pred_valid))

    # Average over the folds actually evaluated rather than relying on the
    # separate global k staying in sync with folds_data.
    avg_training_acc = sum(train_acc_for_all_folds) / len(train_acc_for_all_folds)
    print (f"Average training accuracy for model {model_name} = {avg_training_acc}")

    avg_validation_acc = sum(valid_acc_for_all_folds) / len(valid_acc_for_all_folds)
    print (f"Average validation accuracy for model {model_name} = {avg_validation_acc}")

    # Select best model based on average validation accuracy
    if avg_validation_acc > best_validation_accuracy:
        best_validation_accuracy = avg_validation_acc
        best_model_name = model_name
        # NOTE: this is the estimator as left by its last fit() call, i.e.
        # trained on the final fold's training split only. Refit it on all of
        # the non-test data before scoring it on the held-out test set.
        best_model = model

print (f"-----------------------------------")
print (f"Best model for the task is {best_model_name} which offers the validation accuracy of {best_validation_accuracy}")
I tried to find any remaining string values in x_train, y_train, x_valid, and y_valid, but could not find any.
This can happen because some columns in your dataset contain categorical (string) data, which the models cannot convert to floats. First convert those columns into numbers using Method 1 or Method 2. Method 1:
Method 2: convert the values of the particular column into integers.