Creating an ensemble of classifiers based on predefined feature subsets


The following MWE creates an ensemble from pipelines that pair SelectKBest feature selection with a RandomForest classifier.

# required import
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline

# ensemble created from features selected
def get_ensemble(n_features):
  # define base models
  models = []
  # enumerate the features in the training dataset
  for i in range(1, n_features + 1):
    # feature selection transform
    fs = SelectKBest(score_func=f_classif, k=i)
    # create the model
    model = RandomForestClassifier(n_estimators=50)
    # create the pipeline
    pipe = Pipeline([('fs', fs), ('m', model)])
    # list of tuple of models for voting
    models.append((str(i), pipe))

  # define the voting ensemble
  ensemble_clf = VotingClassifier(estimators=models, voting='hard')

  return ensemble_clf

So, to use the ensemble model:

# generate data for a 3-class classification
X, y = make_classification(n_samples=1000, n_features=10, n_classes=3,
                             n_informative=3)

X = pd.DataFrame(X, columns=list('ABCDEFGHIJ'))

X_train, X_test, y_train, y_test = train_test_split(X, y,
         test_size=0.3, random_state=42)

X_train.head()
       A       B       C       D       E       F       G       H       I      J
541  0.1756 -0.3772 -1.6396 -0.7524  0.2138  0.3113 -1.4906 -0.2885  0.1226  0.2057
440 -0.4381 -0.3302  0.7514 -0.4684 -1.2477 -0.5081 -0.7934 -0.3138  0.8423 -0.4038
482 -0.6648  1.2337 -0.2878 -1.6737 -1.2377 -0.4479 -1.1843 -0.2424 -0.9935 -1.4537
422  0.6099  0.2475  0.9612 -0.7339  0.6926 -1.5761 -1.6061 -0.3879 -0.1895  1.3738
778 -1.4893  0.5234  1.6126  0.8704 -2.7363 -1.3818 -0.2196 -0.7894 -1.1755 -2.8779

# get the ensemble model
ensemble_classifier = get_ensemble(X_train.shape[1])

ensemble_classifier.fit(X_train, y_train)

This creates 10 base models (one per value of k, since n_features=10) and combines them in a VotingClassifier that decides by majority vote (voting='hard').
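For completeness, prediction and scoring then follow the usual scikit-learn API (a quick sketch; accuracy_score is just one possible metric):

# predict on the held-out set and score the ensemble
from sklearn.metrics import accuracy_score
yhat = ensemble_classifier.predict(X_test)
print(accuracy_score(y_test, yhat))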

Question:

The MWE described above works fine. However, I would like to replace the SelectKBest feature selection process in the get_ensemble function.

I have conducted a different feature selection process, and discovered the "optimal" feature subset for each class in this dataset as follows:

    class    | best predictors
-------------+-------------------
   class 0   |  A, B, C
   class 1   |  D, E, F, G
   class 2   |  G, H, I, J
-------------+-------------------

So the modification I would like to make to get_ensemble is this: instead of iterating over the number of available features and creating n base models, it should create 3 base models (one per class), where:

  • base-model 1 will be fitted using the feature subset ['A', 'B', 'C'].

  • base-model 2 will be fitted using the feature subset ['D', 'E', 'F', 'G'].

  • base-model 3 will be fitted using the feature subset ['G', 'H', 'I', 'J'].

  • finally, the ensemble_classifier combines the base models' outputs by majority voting.

That is, when I call:

ensemble_classifier.fit(X_train, y_train)

It proceeds like so:

# 1st base model fitted on its feature subset
model_1.fit(X_train[['A', 'B', 'C']], y_train)
# 2nd base model
model_2.fit(X_train[['D', 'E', 'F', 'G']], y_train)
# 3rd base model
model_3.fit(X_train[['G', 'H', 'I', 'J']], y_train)

This scenario should apply during prediction as well: each base model must select its appropriate feature subset from X_test when ensemble_classifier.predict(X_test) is called, before the final voting.
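Conceptually, prediction would mirror the fitting step above (a sketch):

# each base model predicts from its own feature subset
pred_1 = model_1.predict(X_test[['A', 'B', 'C']])
pred_2 = model_2.predict(X_test[['D', 'E', 'F', 'G']])
pred_3 = model_3.predict(X_test[['G', 'H', 'I', 'J']])
# the final prediction is the majority vote over pred_1, pred_2, pred_3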

I am not sure how to proceed. Any ideas?

EDIT

Regarding this question, I made some changes (e.g. dropping the VotingClassifier) so that a final model is further trained on the outputs of the base models (the base models' confidences) and then makes the final predictions.

I created the following ensemble class:

import pandas as pd
from sklearn.base import clone

class CustomEnsemble:
    def __init__(self, base_model, best_feature_subsets):
        # one clone of the base model per class (used as one-vs-rest base models)
        self.base_models = {class_label: clone(base_model) for class_label in best_feature_subsets}
        self.best_feature_subsets = best_feature_subsets
        # clone again so the final model is independent of the caller's estimator
        self.final_model = clone(base_model)

    def train_base_models(self, X_train, y_train):
        # fit each base model as a one-vs-rest classifier on its feature subset
        for class_label, features in self.best_feature_subsets.items():
            model = self.base_models[class_label]
            model.fit(X_train[features], (y_train == class_label))

        return self
    
    def train_final_model(self, X_train, y_train):
        """
        It might be better to implement both training steps (base models and
        final model) together in a single method, such as train_base_models.
        """
        predictions = pd.DataFrame()

        for class_label, model in self.base_models.items():
            predictions[class_label] = model.predict_proba(X_train[self.best_feature_subsets[class_label]])[:, 1]

        self.final_model.fit(predictions, y_train)


    def predict_base_models(self, X_test):
        predictions = pd.DataFrame()

        for class_label, model in self.base_models.items():
            predictions[class_label] = model.predict_proba(X_test[self.best_feature_subsets[class_label]])[:, 1]

        return predictions

    def predict(self, X_test):
        base_model_predictions = self.predict_base_models(X_test)
        return self.final_model.predict(base_model_predictions)

    def predict_proba_base_models(self, X_test):
        # identical to predict_base_models: the base models emit class-1
        # probabilities either way
        return self.predict_base_models(X_test)

    def predict_proba(self, X_test):
        base_model_predictions = self.predict_proba_base_models(X_test)
        return self.final_model.predict_proba(base_model_predictions)

Usage:

  1. Define the dictionary of best feature subsets per class:

optimal_features = {
    0: ['A', 'B', 'C'],
    1: ['D', 'E', 'F', 'G'],
    2: ['G', 'H', 'I', 'J']
}

  2. Instantiate the class:

classifier = RandomForestClassifier()
ensemble   = CustomEnsemble(classifier, optimal_features)

  3. Train the models:

# first, train base models
ensemble.train_base_models(X_train, y_train)
# then, train the ensemble
ensemble.train_final_model(X_train, y_train)

  4. Make predictions:

yhat = ensemble.predict(X_test)
yhat_proba = ensemble.predict_proba(X_test)  # so as to calculate roc_auc_score()
However, it appears I am not doing things right: I am not sure the final model is really being trained on the outputs of the base models rather than on the original input features.

Also, I am not sure whether separating train_base_models() and train_final_model() is the best approach (it implies fitting twice: base models, then the final model, as in the usage above), or whether it would be better to combine them into one method (say, train_ensemble(); see the sketch below).
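One combined method I am considering would look something like this (a sketch only, for the CustomEnsemble class above; cross_val_predict is one way to get out-of-fold base-model outputs, so the final model is not fitted on in-sample predictions):

from sklearn.model_selection import cross_val_predict

# hypothetical combined training method for the CustomEnsemble class above
def train_ensemble(self, X_train, y_train):
    predictions = pd.DataFrame()
    for class_label, features in self.best_feature_subsets.items():
        model = self.base_models[class_label]
        target = (y_train == class_label)
        # out-of-fold probabilities become the final model's training features
        predictions[class_label] = cross_val_predict(
            model, X_train[features], target, cv=5, method='predict_proba')[:, 1]
        # refit on the full training data for use at prediction time
        model.fit(X_train[features], target)
    self.final_model.fit(predictions, y_train)
    return self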


There are 2 best solutions below

Answer from Ben Reiniger

You just need an estimator that selects a prespecified list of features; there are multiple options, but one is a ColumnTransformer:

from sklearn.compose import ColumnTransformer

selector_1 = ColumnTransformer([
    ("select", "passthrough", ['A', 'B', 'C']),
])
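For instance, get_ensemble could then be rewritten around the class-specific subsets (a sketch; optimal_features is the dictionary from the question, and columns outside each subset are dropped by default):

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline

def get_ensemble(feature_subsets):
    models = []
    for class_label, features in feature_subsets.items():
        # 'passthrough' keeps the named columns; the remainder is dropped
        selector = ColumnTransformer([('select', 'passthrough', features)])
        pipe = Pipeline([('fs', selector), ('m', RandomForestClassifier(n_estimators=50))])
        models.append((str(class_label), pipe))
    return VotingClassifier(estimators=models, voting='hard')

ensemble_classifier = get_ensemble(optimal_features)
ensemble_classifier.fit(X_train, y_train)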
Answer from Rahees Ahmed

Here's what I did to give your model a boost:

Mixing up the models: Instead of sticking with just RandomForest, I threw in a mix of different models like DecisionTree, RandomForest, SVC, KNeighbors, and GradientBoosting. It's like getting the best of all worlds, which should help the ensemble get smarter at making predictions.

Tuning the Decision Tree: I used GridSearchCV to fine-tune the Decision Tree. It's like finding the sweet spot where the tree is just right - not too simple, not too complex.

Dealing with imbalanced data: If your data has more of one class than others, it can make the model biased. So, I used SMOTE to balance things out. This way, all classes get equal attention.

Custom Ensemble Approach: Instead of the usual VotingClassifier, I put together a custom ensemble class. Each model in this class focuses on specific features you've chosen. This way, each model becomes an expert in its own area.

Checking the model's performance: After training, I checked how well the model did using metrics like accuracy, precision, recall, and F1 score. These are like report cards for the model, showing us how well it's doing.

Accuracy: 0.94
Precision: 0.94
Recall: 0.94
F1 Score: 0.94
Confusion Matrix:
[[135   9]
 [  9 147]]

The cool part is, the new ensemble model got an accuracy of 0.94, way better than the original 0.4633. This means it's doing a great job at making the right predictions.

Model Diversity: Support Vector Machines, K-Nearest Neighbors, and Gradient Boosting classifiers

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Generating a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=42)
feature_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
X = pd.DataFrame(X, columns=feature_names)

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Balancing the training set with SMOTE (produces X_train_smote/y_train_smote used below)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Optimizing the Decision Tree using GridSearchCV
param_grid = {'max_depth': [3, 5, 7], 'min_samples_split': [2, 4, 6]}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_decision_tree = grid_search.best_estimator_

# New base models with added diversity
base_models = [
    best_decision_tree,
    RandomForestClassifier(),
    SVC(probability=True),  # SVM
    KNeighborsClassifier(),  # KNN
    GradientBoostingClassifier()  # Gradient Boosting
]

# Assuming feature_subsets remains the same
feature_subsets = {
    0: ['A', 'B', 'C'],
    1: ['D', 'E', 'F', 'G'],
    2: ['G', 'H', 'I', 'J']
}

# Train and test the custom ensemble model
# (this assumes a modified CustomEnsemble that accepts a list of base models
#  and exposes a combined train_ensemble() method; a sketch follows this block)
ensemble = CustomEnsemble(base_models, feature_subsets)
ensemble.train_ensemble(X_train_smote, y_train_smote)
y_pred = ensemble.predict(X_test)

# Evaluating performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Let me know if you have any questions.

For end-to-end testing, you can follow the code on Google Colab.