How to Avoid Duplicate Selection in Combinatorial Optimization Problems Using Genetic Algorithms

46 Views Asked by At

Abstracts I am currently developing a program that uses a genetic algorithm to solve a combinatorial optimization problem. The objective is to select a control group of 2000 people from 5000 people with attribute values that are closest to the test group. However, when I run the program, I am experiencing problems with the same person being selected multiple times.

Issue Details

  • I am using the code below and it is not working as expected. Sometimes certain individuals are selected more than once.
  • Attempts to solve the problem: Questions to ChatGPT, consulting an engineer who is not an algorithm expert.

Goal we are trying to achieve

  • We want to find a way to prevent the same person from being selected multiple times.
  • We are looking for a way to solve the problem without compromising computational efficiency.

Question

  • How can I prevent the same person from being selected more than once in a program using a genetic algorithm?

Code

import pandas as pd
import numpy as np
from deap import base, creator, tools, algorithms
import random
import time

# Dictionary of configuration variables
CONFIG = {
    "file_path": 'C:/Python用/test1_GENIE.csv', #File path of read data
    "population_size": 1000, 
    "generations": 100, 
    "cxpb": 0.5, 
    "mutpb": 0.2, 
    "n_control": 2000, 
    "pass_threshold": 0.05, 
    "output_path": 'outputpath' 
}

# File path
file_path = CONFIG["file_path"]

# data load
data = pd.read_csv(file_path)

# Separate data into test and control groups
test_group = data[data['is_test'] == 1]
control_group = data[data['is_test'] == 0]

# Calculation of mean value agreement for each column
def calculate_match_rates_vectorized(test_group, control_group, columns):
    # Calculate the mean of the test and control groups for each column
    means_test = test_group[columns].mean()
    means_control = control_group[columns].mean()

    # Replace zero values with smaller numbers to avoid dividing by zero
    means_control[means_control == 0] = np.finfo(float).eps

    # Calculation of Match Ratio
    match_rates = abs(1 - (means_test / means_control))
    return match_rates


# Calculation of the evaluation function
columns = data.columns[3:]

# Genetic Algorithm Parameters
POPULATION_SIZE = CONFIG["population_size"]
GENERATIONS = CONFIG["generations"]
CXPB = CONFIG["cxpb"]
MUTPB = CONFIG["mutpb"]
N_CONTROL = CONFIG["n_control"]

# objective function
def evaluate(individual):
    selected_control_group = control_group.iloc[individual]
    match_rates = calculate_match_rates_vectorized(test_group, selected_control_group, columns)
    max_match_rate = max(match_rates)
    return (max_match_rate,)


# Genetic Algorithm Initialization
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("indices", random.sample, range(len(control_group)), N_CONTROL)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.indices)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Record current time before algorithm execution
start_time = time.time()

# Running Genetic Algorithms
def main():
    random.seed(64)
    pop = toolbox.population(n=POPULATION_SIZE)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=GENERATIONS, 
                                   stats=stats, halloffame=hof, verbose=True)

    # Record the time after the algorithm is executed and calculate the execution time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"run time: {elapsed_time} sec")
    
    # Output the index of the best individuals of the last generation to a CSV file
    best_individual = hof[0]
    best_control_group = control_group.iloc[best_individual]
    output_path = CONFIG["output_path"]
    best_control_group.to_csv(output_path, index=False)

    # Display of success message
    print(f"Data has been successfully saved. Save to: {output_path}")
    
    return pop, log, hof

if __name__ == "__main__":
    pop, log, hof = main()
0

There are 0 best solutions below