Preserving DataFrame Modifications Across Options in a Streamlit Application

32 Views Asked by At

I have a Streamlit application called PredictaApp that allows users to upload a CSV file and perform various data analysis and preprocessing tasks on the uploaded dataset. One of the tasks is imputing missing values in the dataset, which is handled by the DataImputer class in the missing_data module.

predicta.py->

import streamlit as st
import pandas as pd
from DataExplore import explore
from FeatureCleaning import missing_data, outlier
from chat import ChatPredicta
from MLModel import predictmlalgo
from codeditor import PredictaCodeEditor
import theme


class PredictaApp:
    def __init__(self):
        self.df = None
        self.anthropi_api_key = None

    def show_hero_image(self):
        st.image("Hero.png")

    def show_footer(self):
        st.markdown("---")
        footer = "*copyright@infinitequants*"
        st.markdown(footer)

        footer_content = """
        <div class="footer">
            Follow us: &nbsp;&nbsp;&nbsp;
            <a href="https://github.com/ahammadnafiz" target="_blank">GitHub</a>  |
            <a href="https://twitter.com/ahammadnafi_z" target="_blank">Twitter</a> 
        </div>
        """
        st.markdown(footer_content, unsafe_allow_html=True)
    
    def file_upload(self):
        uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
        if uploaded_file is not None:
            self.data = pd.read_csv(uploaded_file)
            self.df = self.data.copy(deep=True)
            
    def handle_sidebar(self):
        st.sidebar.title("Predicta")
        st.sidebar.markdown("---")

        self.file_upload()

        with st.sidebar:
            self.anthropi_api_key = st.text_input(
                "Anthropic API Key", key="file_qa_api_key", type="password"
            )
            "[Get an Anthropic API key](https://console.anthropic.com/)"

        st.sidebar.title("Tools")
        selected_option = st.sidebar.radio(
            "Select Option",
            [
                "Data Explore",
                "Impute Missing Values",
                "Detect Outlier",
                "Chat With Predicta",
                "PredictaCodeEditor",
                "Select ML Models",
            ],
        )
        if selected_option == "Data Explore":
            self.handle_data_explore()
        elif selected_option == "Impute Missing Values":
            self.handle_impute_missing_values()
        elif selected_option == "Detect Outlier":
            self.handle_detect_outlier()
        elif selected_option == "Chat With Predicta":
            self.handle_chat_with_predicta()
        elif selected_option == "PredictaCodeEditor":
            self.code_editor()
        elif selected_option == "Select ML Models":
            self.handle_select_ml_models()
            
        st.sidebar.markdown("---")
        self.handle_about()
        self.handle_help()

    def handle_about(self):
        st.sidebar.markdown("#### About")
        st.sidebar.info("Predicta is a powerful data analysis and machine learning tool designed to streamline your workflow and provide accurate predictions.")

    def handle_help(self):
        st.sidebar.markdown("#### Help")
        st.sidebar.info("For any assistance or inquiries, please contact us at [email protected].")

    def handle_data_explore(self):
        if self.df is not None:
            analysis = explore.DataAnalyzer(self.df)
            analysis.analyzer()
        else:
            st.markdown(
                "<div style='text-align: center; margin-top: 20px; margin-bottom: 20px; font-size: 15px;'>Please upload a dataset to Explore.</div>",
                unsafe_allow_html=True,
            )
            st.image("uploadfile.png", use_column_width=True)

    def handle_impute_missing_values(self):
        if self.df is not None:
            impute = missing_data.DataImputer(self.df)
            impute.imputer()
        else:
            st.markdown(
                "<div style='text-align: center; margin-top: 20px; margin-bottom: 20px; font-size: 15px;'>Please upload a dataset to perform feature cleaning.</div>",
                unsafe_allow_html=True,
            )
            st.image("uploadfile.png", use_column_width=True)

    def handle_detect_outlier(self):
        if self.df is not None:
            out = outlier.OutlierDetector(self.df)
            out.outlier_detect()
        else:
            st.markdown(
                "<div style='text-align: center; margin-top: 20px; margin-bottom: 20px; font-size: 15px;'>Please upload a dataset to detect outlier.</div>",
                unsafe_allow_html=True,
            )
            st.image("uploadfile.png", use_column_width=True)

    def handle_chat_with_predicta(self):
        chat_page = ChatPredicta(self.df, self.anthropi_api_key)
        chat_page.chat_with_predicta()

    def code_editor(self):
        editor = PredictaCodeEditor()
        editor.run_code_editor(self.df)
        
    def handle_select_ml_models(self):
        if self.df is not None:
            model = predictmlalgo.PredictAlgo(self.df)
            model.algo()
        else:
            st.markdown(
                "<div style='text-align: center; margin-top: 20px; margin-bottom: 20px; font-size: 15px;'>Please upload a dataset to Perform Prediction.</div>",
                unsafe_allow_html=True,
            )
            st.image("uploadfile.png", use_column_width=True)

    def run(self):
        self.show_hero_image()
        self.handle_sidebar()
        self.show_footer()


if __name__ == "__main__":
    st.set_page_config(
        page_title="Predicta",
        page_icon="",
        initial_sidebar_state="expanded"
    )
    theme.footer()
    
    app = PredictaApp()
    app.run()

missing_data.py->

import pandas as pd
import numpy as np
import logging
import streamlit as st
from predicta import PredictaApp

class DataImputer(PredictaApp):
    def __init__(self, data):
        super().__init__()
        if not isinstance(data, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")
        self.data = data
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(logging.StreamHandler())

    def check_missing(self, output_path=None):
        try:
            result = pd.concat([self.data.isnull().sum(), self.data.isnull().mean()], axis=1)
            result = result.rename(index=str, columns={0: 'total missing', 1: 'proportion'})
            
            if output_path is not None:
                result.to_csv(output_path + 'missing.csv')
                self.logger.info('Result saved at %smissing.csv', output_path)
            return result
        except Exception as e:
            self.logger.error("An error occurred while checking missing values: %s", str(e))
            raise

    def drop_missing(self, axis=0):
        try:
            original_shape = self.data.shape
            self.data = self.data.dropna(axis=axis)
            if self.data.shape == original_shape:
                return None  
            else:
                return self.data
        except Exception as e:
            self.logger.error("An error occurred while dropping missing values: %s", str(e))
            raise

    def add_var_denote_NA(self, NA_col=[]):
        try:
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    self.data[i] = np.where(self.data[i].isnull(), 1, 0)
                    return self.data
                else:
                    self.logger.warning("Column %s has no missing cases", i)
        except Exception as e:
            self.logger.error("An error occurred while adding variable to denote NA: %s", str(e))
            raise

    def impute_NA_with_arbitrary(self, impute_value, NA_col=[]):
        try:
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    self.data[i].fillna(impute_value, inplace=True)
                else:
                    self.logger.warning("Column %s has no missing cases", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with arbitrary value: %s", str(e))
            raise

    def impute_NA_with_avg(self, strategy='mean', NA_col=[]):
        try:
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    if strategy == 'mean':
                        self.data[i].fillna(self.data[i].mean(), inplace=True)
                    elif strategy == 'median':
                        self.data[i].fillna(self.data[i].median(), inplace=True)
                    elif strategy == 'mode':
                        self.data[i].fillna(self.data[i].mode()[0], inplace=True)
                    return self.data
                    
                else:
                    self.logger.warning("Column %s has no missing", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with average: %s", str(e))
            raise

    def impute_NA_with_end_of_distribution(self, NA_col=[]):
        try:
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    self.data[i].fillna(self.data[i].mean() + 3 * self.data[i].std(), inplace=True)
                    return self.data
                else:
                    self.logger.warning("Column %s has no missing", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with end of distribution: %s", str(e))
            raise

    def impute_NA_with_random(self, NA_col=[], random_state=0):
        try:
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    random_sample = self.data[i].dropna().sample(self.data[i].isnull().sum(), random_state=random_state)
                    random_sample.index = self.data[self.data[i].isnull()].index
                    self.data.loc[self.data[i].isnull(), i] = random_sample
                    return self.data
                
                else:
                    self.logger.warning("Column %s has no missing", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with random sampling: %s", str(e))
            raise

    def impute_NA_with_interpolation(self, method='linear', limit=None, limit_direction='forward', NA_col=[]):
        try:
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    self.data[i] = self.data[i].interpolate(method=method, limit=limit, limit_direction=limit_direction)
                    return self.data
                else:
                    self.logger.warning("Column %s has no missing cases", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with interpolation: %s", str(e))
            raise

    def impute_NA_with_knn(self, NA_col=[], n_neighbors=5):
        try:
            from sklearn.impute import KNNImputer
            knn_imputer = KNNImputer(n_neighbors=n_neighbors)
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    imputed_values = knn_imputer.fit_transform(self.data[i].values.reshape(-1, 1))
                    self.data[i] = imputed_values.ravel()
                    return self.data
                else:
                    self.logger.warning("Column %s has no missing cases", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with KNN: %s", str(e))
            raise

    def impute_NA_with_mice(self, NA_col=[], n_iterations=10):
        try:
            from impyute.imputation.cs import mice
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    imputed_data = mice(self.data[i].values.reshape(1, -1), n_iterations=n_iterations)
                    self.data[i] = imputed_data.T.ravel()
                    return self.data
                else:
                    self.logger.warning("Column %s has no missing cases", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with MICE: %s", str(e))
            raise

    def impute_NA_with_missforest(self, NA_col=[], n_estimators=100, max_depth=None):
        try:
            from missingpy import MissForest
            imputer = MissForest(n_estimators=n_estimators, max_depth=max_depth)
            for i in NA_col:
                if self.data[i].isnull().sum() > 0:
                    imputed_data = imputer.fit_transform(self.data[i].values.reshape(-1, 1))
                    self.data[i] = imputed_data.ravel()
                    return self.data
                else:
                    self.logger.warning("Column %s has no missing cases", i)
        except Exception as e:
            self.logger.error("An error occurred while imputing NA with MissForest: %s", str(e))
            raise

    def imputer(self):
        
        st.markdown(
    "<h1 style='text-align: center; font-size: 30px;'>Impute Missing Values</h1>", 
    unsafe_allow_html=True
)
        st.markdown("---")
        st.markdown("<h2 style='text-align: center; font-size: 20px;'>Original Dataset</h1>", unsafe_allow_html=True)
        st.dataframe(self.data, width=800)

        
        option = st.selectbox("Select an Imputation Method", [
            "Check Missing Values",
            "Drop Missing Values",
            "Add Variable to Denote NA",
            "Impute NA with Arbitrary Value",
            "Impute NA with Interpolation",
            "Impute NA with KNN",
            "Impute NA with MICE",
            "Impute NA with MissForest",
            "Impute NA with Average",
            "Impute NA with End of Distribution",
            "Impute NA with Random Sampling"
        ])

        if option == "Check Missing Values":
            if st.button("Check"):
                self.check_missing()
                st.write(self.check_missing())

        elif option == "Drop Missing Values":
            axis = st.radio("Drop rows or columns?", ["Rows", "Columns"])
            axis = 0 if axis == "Rows" else 1
            if st.button("Drop"):
                self.drop_missing(axis=axis)
                if self.data is not None:
                    st.dataframe(self.data)
                else:
                    st.warning("No missing values found in the data.")

        elif option == "Add Variable to Denote NA":
            selected_columns = st.multiselect("Select columns to impute", options=self.data.columns)
            if st.button("Add"):
                if selected_columns:
                    data_add_var = self.add_var_denote_NA(NA_col=selected_columns)
                    st.write(data_add_var)
                else:
                    st.warning("Please select at least one column to impute")

        elif option == "Impute NA with Arbitrary Value":
            impute_value = st.text_input("Enter Arbitrary Value")
            na_cols = st.multiselect("Select Columns", self.data.columns)
            if st.button("Impute Arbitrary Value"):
                data_impute_arb = self.impute_NA_with_arbitrary(impute_value=float(impute_value), NA_col=na_cols)
                st.write(data_impute_arb)

        elif option == "Impute NA with Interpolation":
            na_cols = st.multiselect("Select Columns", self.data.columns)
            interp_method = st.selectbox("Interpolation Method", ['linear', 'quadratic', 'cubic'])
            interp_limit = st.text_input("Limit", None)
            interp_limit_direction = st.selectbox("Limit Direction", ['forward', 'backward', 'both'])
            if st.button("Impute Interpolation"):
                data_interp = self.impute_NA_with_interpolation(method=interp_method, limit=interp_limit, limit_direction=interp_limit_direction, NA_col=na_cols)
                st.write(data_interp)

        elif option == "Impute NA with KNN":
            n_neighbors = st.number_input("Number of Neighbors", min_value=1, value=5)
            selected_columns = st.multiselect("Select columns to impute", options=self.data.columns)
            if st.button("Impute KNN"):
                if selected_columns:  # Check if at least one column is selected
                    data_knn = self.impute_NA_with_knn(NA_col=selected_columns, n_neighbors=n_neighbors)
                    st.write(data_knn)
                else:
                    st.warning("Please select at least one column to impute")

        elif option == "Impute NA with MICE":
            na_cols = st.multiselect("Select Columns", self.data.columns)
            n_iterations = st.number_input("Number of Iterations", min_value=1, value=10)
            if st.button("Impute MICE"):
                data_mice = self.impute_NA_with_mice(NA_col=na_cols, n_iterations=n_iterations)
                st.write(data_mice)

        elif option == "Impute NA with MissForest":
            na_cols = st.multiselect("Select Columns", self.data.columns)
            n_estimators = st.number_input("Number of Estimators", min_value=1, value=100)
            max_depth = st.text_input("Max Depth", None)
            if st.button("Impute MissForest"):
                data_missforest = self.impute_NA_with_missforest(NA_col=na_cols, n_estimators=n_estimators, max_depth=max_depth)
                st.write(data_missforest)

        elif option == "Impute NA with Average":
            na_cols = st.multiselect("Select Columns", self.data.columns)
            strategy = st.selectbox("Imputation Strategy", ['mean', 'median', 'mode'])
            if st.button("Impute Average"):
                data_avg = self.impute_NA_with_avg(strategy=strategy, NA_col=na_cols)
                st.write(data_avg)

        elif option == "Impute NA with End of Distribution":
            na_cols = st.multiselect("Select Columns", self.data.columns)
            if st.button("Impute End of Distribution"):
                data_end_dist = self.impute_NA_with_end_of_distribution(NA_col=na_cols)
                st.write(data_end_dist)

        elif option == "Impute NA with Random Sampling":
            na_cols = st.multiselect("Select Columns", self.data.columns)
            random_state = st.number_input("Random State", min_value=0, value=0)
            if st.button("Impute Random"):
                data_random = self.impute_NA_with_random(NA_col=na_cols, random_state=random_state)
                st.write(data_random)
        
        return self.data

I want that when I do some modification in missing_data.py I also modify the original df.

The issue I'm facing is that when I switch between different options (e.g., Data Explore, Impute Missing Values, Detect Outlier, etc.) in the Streamlit application, the modifications made to the DataFrame in the previous option are not preserved. Specifically, when I impute missing values using the DataImputer class and then switch to a different option, the original unmodified DataFrame is loaded again, overwriting the changes made by the imputation process.

I want to ensure that the modifications made to the DataFrame in one option (e.g., Impute Missing Values) are carried over to the other options (e.g., Detect Outlier, Select ML Models, etc.), so that subsequent operations are performed on the modified DataFrame, rather than the original unmodified DataFrame.

In summary, the problem statement is: "How to preserve the modifications made to the DataFrame across different options in the Streamlit application, specifically after imputing missing values using the DataImputer class."

I tried modifying the imputer method in the DataImputer class to return the modified DataFrame, and then assign it back to self.df in the PredictaApp class. I expected this to ensure that the modifications made to the DataFrame in the DataImputer class would be reflected in self.df in the PredictaApp class, and subsequent operations on the DataFrame would use the modified data across different options.

However, even after making these changes, when I switched to a new option in Streamlit, the original unmodified DataFrame was being loaded again, overwriting the modifications made in the previous option.

0

There are 0 best solutions below