Keep Vector store persistent even after close browser tab (Document Question and Answer Chatbot)

16 Views Asked by At

Here's my python file which is a streamlit app that you can upload documents and then question and answer, basically a standard ask document chatbot.

how should i modify it so that even if i close the browser tab and open it again, the vector store is saved and the user doesn't have to re-upload the files? in fact, the user is able to continously upload more files.

import os
from apikey import apikey
import streamlit as st 

from langchain.chat_models import ChatOpenAI 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings.openai import OpenAIEmbeddings 
from langchain.vectorstores import Chroma 
from langchain.chains import ConversationalRetrievalChain

# Set the OpenAI API key in the environment variables for authentication.
os.environ["OPENAI_API_KEY"] = apikey

# Define a function to clear the conversation history stored in Streamlit's session state.
def clear_history():
   if 'history' in st.session_state:
      del st.session_state['history']

# Set the title of the web page displayed to the user.
st.title('Chat with Document')

# Create a file uploader widget allowing users to upload documents in PDF, DOCX, or TXT format.
uploaded_file = st.file_uploader('Upload file:',type=['pdf','docx', 'txt'])

# Create a button that, when clicked, triggers the clear_history function to reset the session.
add_file = st.button('Add File', on_click=clear_history)

# Check if a file has been uploaded and the 'Add File' button has been pressed.
if uploaded_file and add_file:
    with st.spinner('Reading, chunking and embedding file...'):
        # Read the uploaded file's content as bytes.
        bytes_data = uploaded_file.read()
        # Construct a file path to save the uploaded file temporarily.
        file_name = os.path. join('./', uploaded_file.name)
        # Save the uploaded file to the constructed path.
        with open (file_name, 'wb') as f:
            f.write(bytes_data)

        # Determine the file's extension to decide on the appropriate loader.
        name, extension = os.path.splitext(file_name)

        # Select the loader based on the file extension.
        if extension == '.pdf':
            from langchain.document_loaders import PyPDFLoader
            loader = PyPDFLoader(file_name)
        elif extension == '.docx':
            from langchain.document_loaders import Docx2txtLoader
            loader = Docx2txtLoader(file_name)
        elif extension == '.txt':
            from langchain.document_loaders import TextLoader 
            loader = TextLoader(file_name)
        else:
            st.write('Document format is not supported!')
        
        # Load the document using the selected loader.
        documents = loader.load()

        # Initialize the text splitter and split the loaded document into manageable chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_documents(documents)
        # Initialize embeddings and vector store for the document chunks.
        embeddings = OpenAIEmbeddings()
        vector_store = Chroma.from_documents(chunks, embeddings)

        # Initialize a ChatOpenAI instance with GPT-3.5 turbo model and temperature set to 0 for objective responses.
        llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)
        # Create a retriever from the vector store for document retrieval.
        retriever=vector_store.as_retriever()

        # Initialize a conversational retrieval chain with the language model and the retriever.
        crc = ConversationalRetrievalChain.from_llm(llm, retriever)

        # Store the initialized conversational retrieval chain in Streamlit's session state.
        st.session_state.crc = crc

        # Display a success message once the file has been processed.
        st.success('File uploaded, chunked and embedded successfully')

# Create an input widget for users to type in their questions.
question = st.text_input('Input your question')

# Process the question if it has been asked.
if question:
    if 'crc' in st.session_state:
        crc = st.session_state.crc

        # Initialize the chat history in session state if it doesn't exist.
        if 'history' not in st.session_state:
            st.session_state['history'] = []
        
        # Run the conversational retrieval chain with the current question and chat history.
        response = crc.run({
            'question':question, 
            'chat_history': st.session_state['history']
        })
        
        # Append the current question and response to the chat history.
        st.session_state['history'].append((question,response)) 
        # Display the response.
        st.write(response)
        # Iterate through the chat history to display previous questions and answers.
        for prompts in st.session_state['history']:
            st.write("Question: " + prompts[0]) 
            st.write("Answer: " + prompts[1])
0

There are 0 best solutions below