Bayesian Network [Variable Elimination]: merge and groupby memory crash using pandas

I tried to speed up my functions and make them more memory-efficient for a variable elimination algorithm on a Bayesian network, but the script still crashes once the dataframes get too big.

I have created a sample script to play around with. You can adjust the column size etc. as indicated.

Please note: pre-processing and heuristics that prevent the table sizes from exploding are not included in the example.

import numpy as np
import pandas as pd
import itertools
import time

# Adjust amount of dataframes here:
how_many_dataframes = 20 

def make_array(how_many_dataframes):
    dic = {}
    for i in range(how_many_dataframes):

# Adjust possible range of column sizes here
        column_size = np.random.choice([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])

        combinations = list(itertools.product([0, 1], repeat=column_size))
        bool_columns = np.array(combinations, dtype=int)
        random_values = np.random.uniform(0, 0.5, size=(2**column_size, 1))
        array = np.concatenate((bool_columns, random_values), axis=1)
        letters = np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')) # random single-letter name used as this factor's dict key
        unique_columns = set()
        while len(unique_columns) < column_size:
            unique_columns.add(''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), size=1)))
        df = pd.DataFrame(array, columns=list(unique_columns) + ['prob'])
        dic[letters] = df
        
    return dic

network = make_array(how_many_dataframes)


def merge_factors(network, elim_order):   
    print("###########################################")
    print(f"Elimination order: {elim_order}")
    print(f"Elimination order length: {len(elim_order)}")
    print("###########################################") 
    eliminate_node = elim_order[0] # note: this is the node to be eliminated
    factor_list = [] # list to store the relevant factors

    for network_key, network_df in network.items():  
        if elim_order[0] in network_df.columns:
            factor_list.append(network_key) # append the relevant factors to the list

    while len(factor_list) > 1: # as long as there are more than one relevant factors
        factor_1, factor_2 = factor_list[:2]
        merge_on = [col for col in network[factor_1].columns if col != "prob" and col in network[factor_2]] # merge on all columns except for the probability column
    
        network[factor_1].sort_values(by="prob", inplace=True)
        network[factor_2].sort_values(by="prob", inplace=True)

        start_time = time.time()
        print(f"Merging {factor_1} and {factor_2} on {merge_on}")
        print(f"{factor_1}: {network[factor_1].shape[0]} rows * {network[factor_1].shape[1]} columns")
        print(f"{factor_2}: {network[factor_2].shape[0]} rows * {network[factor_2].shape[1]} columns")
        print(f"Starting merge ...")
        network[factor_2] = pd.merge(network[factor_1], network[factor_2], on=merge_on, suffixes=('_f1', '_f2'), how='inner') # merge the two relevant factors
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Merging complete: {elapsed_time} seconds")

        network[factor_2]['prob'] = network[factor_2]['prob_f1'].mul(network[factor_2]['prob_f2']) # calculate the new probability
        print(f"Calculate new probability complete")
        print(f"Drop redundant columns ...")
        network[factor_2].drop(columns=['prob_f1', 'prob_f2'], inplace = True) # drop the old probability columns
        print(f"Drop redundant columns complete")
        #print(f"Merged frame: {network[factor_2]}")
        print(f"Deleting {factor_1} ...")
        del network[factor_1] # delete the factor that was merged
        print(f"Delete complete")

        factor_list.pop(0)
    
    elim_order.pop(0)
    return eliminate_node

def find_relevant_factor(network, node):
    for network_key, network_df in network.items():  
        if node in network_df.columns:
            relevant_factors = network_key
            return relevant_factors
    return None
        

def marginalize(node, relevant_factor_key, network):

    relevant_factor = network[relevant_factor_key]
    print(f"Marginalizing {node}: {relevant_factor .shape[0]} rows * {relevant_factor .shape[1]} columns ...")

    if len(relevant_factor.columns) > 2 and node in relevant_factor.columns:
        relevant_factor = relevant_factor.drop(columns=[node])
        #relevant_factor = relevant_factor.groupby(relevant_factor.columns[:-1].tolist(), as_index=False).agg({relevant_factor.columns[-1]: 'sum'})
        relevant_factor = relevant_factor.groupby(relevant_factor.columns[:-1].tolist())["prob"].sum().reset_index()
        print(f"Marginalization complete")
        #print(f"Marginalized frame: {relevant_factor}")
        network[relevant_factor_key] = relevant_factor
    return

# Adjust elimination order here:
elim_order = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"] # Adjust elimination order here

while len(elim_order) > 0: # Main loop: continues to multiply and marginalize factors until all variables have been eliminated
    eliminate_node = merge_factors(network, elim_order)
    relevant_factor = find_relevant_factor(network, eliminate_node)
    if relevant_factor is not None:
        start_time = time.time()
        marginalize(eliminate_node, relevant_factor, network)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time for marginalization: {elapsed_time} seconds")

Any suggestions to improve the performance of the merge/marginalization operations?

I read about swifter.apply but couldn't implement it successfully so far. Switching to NumPy for the operations seems clunky.


There is 1 answer below.

Answer by Jérôme Richard:

You have reached the limit of what Pandas can do in terms of performance. While doing the operations manually in a faster way is possible, it is pretty difficult in Python, especially with a merge done on many columns. One solution is to use Polars instead, which is often significantly faster because it can use multiple cores. In this case, Polars seems to be 4-6 times faster on most of these functions on my 6-core machine (pretty good). However, note that the Polars functions are not exactly the same as the ones in Pandas.

First of all, in make_array, you just need to replace pd with pl and add import polars as pl at the beginning of the Python script.
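
For reference, here is a sketch of what the Polars version of make_array could look like. It assumes a recent Polars release, where the column names are passed through the schema argument (older releases accepted columns) and orient="row" makes the 2D-array layout explicit; apart from that, the function is unchanged:

import itertools
import numpy as np
import polars as pl

def make_array(how_many_dataframes):
    dic = {}
    for i in range(how_many_dataframes):
        column_size = np.random.choice([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
        combinations = list(itertools.product([0, 1], repeat=column_size))
        bool_columns = np.array(combinations, dtype=int)
        random_values = np.random.uniform(0, 0.5, size=(2**column_size, 1))
        array = np.concatenate((bool_columns, random_values), axis=1)
        letters = np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
        unique_columns = set()
        while len(unique_columns) < column_size:
            unique_columns.add(''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), size=1)))
        # schema= names the columns (older Polars versions called this argument columns=)
        df = pl.DataFrame(array, schema=list(unique_columns) + ['prob'], orient="row")
        dic[letters] = df
    return dic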

Then, for the merge_factors function, the join, the column drop and the sort need to be slightly modified. Here is an example:

def merge_factors(network, elim_order):   
    print("###########################################")
    print(f"Elimination order: {elim_order}")
    print(f"Elimination order length: {len(elim_order)}")
    print("###########################################") 
    eliminate_node = elim_order[0] # note: this is the node to be eliminated
    factor_list = [] # list to store the relevant factors

    for network_key, network_df in network.items():  
        if elim_order[0] in network_df.columns:
            factor_list.append(network_key) # append the relevant factors to the list

    while len(factor_list) > 1: # as long as there are more than one relevant factors
        factor_1, factor_2 = factor_list[:2]
        merge_on = [col for col in network[factor_1].columns if col != "prob" and col in network[factor_2].columns] # merge on all columns except for the probability column

        network[factor_1].sort(by="prob", in_place=True)
        network[factor_2].sort(by="prob", in_place=True)

        start_time = time.time()
        print(f"Merging {factor_1} and {factor_2} on {merge_on}")
        print(f"{factor_1}: {network[factor_1].shape[0]} rows * {network[factor_1].shape[1]} columns")
        print(f"{factor_2}: {network[factor_2].shape[0]} rows * {network[factor_2].shape[1]} columns")
        print(f"Starting merge ...")
        print(factor_1, factor_2, merge_on)
        network[factor_2] = network[factor_1].join(network[factor_2], on=merge_on, suffix='_f2', how='inner').rename({'prob': 'prob_f1'}) # merge the two relevant factors
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Merging complete: {elapsed_time} seconds")

        network[factor_2]['prob'] = network[factor_2]['prob_f1'] * network[factor_2]['prob_f2'] # calculate the new probability
        print(f"Calculate new probability complete")
        print(f"Drop redundant columns ...")
        network[factor_2].drop_in_place('prob_f1') # drop the old probability columns
        network[factor_2].drop_in_place('prob_f2') # drop the old probability columns
        print(f"Drop redundant columns complete")
        #print(f"Merged frame: {network[factor_2]}")
        print(f"Deleting {factor_1} ...")
        del network[factor_1] # delete the factor that was merged
        print(f"Delete complete")

        factor_list.pop(0)
    
    elim_order.pop(0)
    return eliminate_node
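
Depending on your Polars version, some of the calls above (for example sort(..., in_place=True) and assigning a column with network[factor_2]['prob'] = ...) may be deprecated or gone; recent releases express these steps through the expression API instead. Here is a hedged sketch of the same join/product/drop steps for those versions (multiply_factors is just an illustrative helper name, not part of the original code):

import polars as pl

def multiply_factors(f1: pl.DataFrame, f2: pl.DataFrame) -> pl.DataFrame:
    # join on every column the two factors share, except the probability column
    merge_on = [col for col in f1.columns if col != "prob" and col in f2.columns]
    merged = f1.rename({"prob": "prob_f1"}).join(
        f2.rename({"prob": "prob_f2"}), on=merge_on, how="inner"
    )
    # pointwise product of the two factors, then drop the intermediate columns
    return merged.with_columns(
        (pl.col("prob_f1") * pl.col("prob_f2")).alias("prob")
    ).drop(["prob_f1", "prob_f2"])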

As for the function marginalize, here is the modified version:

def marginalize(node, relevant_factor_key, network):
    relevant_factor = network[relevant_factor_key]
    print(f"Marginalizing {node}: {relevant_factor .shape[0]} rows * {relevant_factor .shape[1]} columns ...")

    if len(relevant_factor.columns) > 2 and node in relevant_factor.columns:
        relevant_factor.drop_in_place(node)
        relevant_factor = relevant_factor.groupby(relevant_factor.columns[:-1])["prob"].sum().rename({'prob_sum': 'prob'})
        print(f"Marginalization complete")
        network[relevant_factor_key] = relevant_factor
    return
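
Similarly, newer Polars releases renamed groupby to group_by and aggregate through expressions rather than column indexing. A hedged sketch of the same marginalization for those versions (marginalize_factor is an illustrative helper name):

import polars as pl

def marginalize_factor(factor: pl.DataFrame, node: str) -> pl.DataFrame:
    # sum the probabilities over the eliminated variable, keeping all other columns;
    # assumes at least one variable remains besides `node` and "prob"
    keep = [col for col in factor.columns if col not in (node, "prob")]
    return factor.group_by(keep).agg(pl.col("prob").sum())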

The code is barely tested but the behaviour seems the same and the performance is clearly improved. Indeed, the time of the whole script decreases from 45.3 seconds to 10.8 seconds. This means this new version is 4.2 times faster on my 6-core machine (i5-9600KF CPU).

I think the resulting code is still sub-optimal, but it is hard to do better in Python without deeply changing the code and making it far more complex (and the additional speed-up would probably not be worth the effort).

Note that the random seed needs to be set to compare results (including timings).
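
For example, fixing NumPy's global seed before building the factors is enough to make the Pandas and Polars runs comparable (the value 42 is arbitrary):

import numpy as np

# call this once, before make_array, so every run generates identical factors
np.random.seed(42)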

Also, please note that using 32-bit floats rather than 64-bit ones might be particularly well suited here, since probabilities hardly need more than 6 digits of (relative) precision. This can speed up the computation further and reduce the memory footprint.
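
For example, casting the probability column to 32-bit floats in Polars could look like the sketch below (to_float32 is an illustrative helper, not part of the script above):

import polars as pl

def to_float32(factor: pl.DataFrame) -> pl.DataFrame:
    # float32 keeps roughly 7 significant digits, which is plenty here,
    # and halves the memory used by the probability column
    return factor.with_columns(pl.col("prob").cast(pl.Float32))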