I have below code for building predictive models for 10 clusters, finding best r2 score for each cluster for each variable and saving best model for each cluster for each variable in a pkl file. First part of my code is working well. In the last part (# Find the model with the highest R-squared score for each cluster and each variable), where it's identifying and saving the best model for each cluster of each variable, it only picks up the r2 score for 1 variable(Total Deaths) not both variables. How to get r2 score for each cluster of each type of variables.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data)

# Clustering
# You can use K-Means clustering to create clusters based on the data.
n_clusters = 10  # Choose the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
data['Cluster'] = kmeans.fit_predict(scaled_features)

# Split the data into clusters
clustered_data = []
for cluster_id in range(n_clusters):
    cluster = data[data['Cluster'] == cluster_id]
    clustered_data.append(cluster)

#for checking r2 score for each cluster for each model


# Build Predictive Models for Each Cluster
target_variables = ["Total Damage ('000 US$)", "Total Deaths"]

models = []

for target_variable in target_variables:
    for cluster_id, cluster in enumerate(clustered_data):
        X = cluster.drop(columns=target_variables)  # Features
        y = cluster[target_variable]  # Target variable

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train a predictive model (e.g., Random Forest)
        random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
        random_forest_model.fit(X_train, y_train)
        models.append((random_forest_model, cluster_id, 'RandomForestRegressor', target_variable))

        # Train Linear Regression model
        linear_regression_model = LinearRegression()
        linear_regression_model.fit(X_train, y_train)
        models.append((linear_regression_model, cluster_id, 'LinearRegression', target_variable))
        
        # Train Decision Tree model
        decision_tree_model = DecisionTreeRegressor(random_state=42)
        decision_tree_model.fit(X_train, y_train)
        models.append((decision_tree_model, cluster_id, 'DecisionTreeRegressor', target_variable))

        # Train SVM model
        svm_model = SVR()
        svm_model.fit(X_train, y_train)
        models.append((svm_model, cluster_id, 'SVR', target_variable))

        # Train MLP model
        mlp_model = MLPRegressor(max_iter=6200, random_state=42)
        mlp_model.fit(X_train, y_train)
        models.append((mlp_model, cluster_id, 'MLPRegressor', target_variable))

        # Train KNN model
        knn_model = KNeighborsRegressor()
        knn_model.fit(X_train, y_train)
        models.append((knn_model, cluster_id, 'KNeighborsRegressor', target_variable))

# Make Predictions
all_predictions = {model_name: {target_variable: [] for target_variable in target_variables} for _, _, model_name, target_variable in models}

for model, cluster_id, model_name, target_variable in models:
    cluster = data[data['Cluster'] == cluster_id]
    X = cluster.drop(columns=target_variables)
    predictions = model.predict(X)
    all_predictions[model_name][target_variable].append(predictions)

# Combine Predictions
for target_variable in target_variables:
    y_true = data[target_variable]
    
    for model_name, predictions_dict in all_predictions.items():
        predictions_list = predictions_dict[target_variable]
        
        for i, predictions in enumerate(predictions_list):
            cluster_indices = data['Cluster'] == i
            cluster_true_values = y_true[cluster_indices]
            cluster_r2 = r2_score(cluster_true_values, predictions)
            
            print(f"\nModel: {model_name} - Target Variable: {target_variable} - Cluster {i}")
            print(f"R-squared: {cluster_r2}")

# Find the model with the highest R-squared score for each cluster and each variable
best_models = {target_variable: {cluster_id: {'model': None, 'r2_score': float('-inf')} 
                                 for cluster_id in range(len(clustered_data))} 
               for target_variable in target_variables}

for target_variable in target_variables:
    for model, cluster_id, model_name, _ in models:
        cluster = data[data['Cluster'] == cluster_id]
        X = cluster.drop(columns=target_variables)
        predictions = model.predict(X)
        
        # Filter the data based on the cluster_id
        cluster_indices = data['Cluster'] == cluster_id

        # Extract the target variable name
        target_variable_name = target_variable.split("(")[0].strip()  # Remove parentheses and spaces

        # Find the closest matching column name
        closest_column = min(X.columns, key=lambda col: target_variable_name.lower() in col.lower())

        cluster_r2 = r2_score(y_true[cluster_indices], predictions)



        if cluster_r2 > best_models[target_variable][cluster_id]['r2_score']:
            best_models[target_variable][cluster_id]['model'] = model
            best_models[target_variable][cluster_id]['r2_score'] = cluster_r2

# Print and save the best models
for target_variable in target_variables:
    for cluster_id, values in best_models[target_variable].items():
        best_model = values['model']
        best_r2_score = values['r2_score']

        print(f"\nBest R-squared Score for Target Variable: {target_variable} - Cluster {cluster_id}: {best_r2_score}")
        print(f"Best Model: {type(best_model).__name__}")

        # Save Best Model as PKL file
        model_path = f'/Users/fahminaahmed/your_project/{target_variable}_Cluster_{cluster_id}.pkl'
        joblib.dump(best_model, model_path)
        print(f"Best Model saved to {model_path}")

Trying to identify best R squared score for each cluster of each variable and print them out and save as pkl file.

0

There are 0 best solutions below