How to upload a document to a Pinecone index using LangChain modules and ask questions about the document

367 Views Asked by At

I want to upload my document to Pinecone and ask questions about the document. I want to ask them in the terminal itself. I know there are Streamlit videos on YouTube, but I don't want to use Streamlit; I'm just checking this from the Python terminal. (Filler text added only to satisfy Stack Overflow's minimum-length requirement.)

import os

# Initialize Pinecone
#pinecone.init(api_key="", environment="eu-west-gcp")

import os
import re
import pdfplumber
import openai
import pinecone
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

# --- Configuration ---------------------------------------------------------
# Credentials are read from the environment so real keys never have to be
# written into the script. The placeholder strings are only fallbacks and
# must be replaced (or the variables exported) before the script can work.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API-KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

# Single source of truth for the embedding model; `model_name` is kept as an
# alias because the rest of the script historically used both names.
MODEL = "text-embedding-ada-002"
model_name = MODEL

# Must match the vector size produced by the embedding model
# (1536 for text-embedding-ada-002).
EMBEDDING_DIMENSION = 1536

# Initialize OpenAI embeddings (LangChain wrapper)
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
)

# Initialize Pinecone
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Define the index name
index_name = "my-index"

# Create the index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBEDDING_DIMENSION)

# Instantiate the index
index = pinecone.Index(index_name)

# Define a function to preprocess text
def preprocess_text(text):
    """Return *text* with every run of whitespace collapsed to one space.

    Spaces, tabs and newlines all count as whitespace. Leading and trailing
    runs are collapsed to a single space as well; they are not stripped.
    """
    return re.sub(r'\s+', ' ', text)

def process_pdf(file_path):
    """Load a PDF and return its text as a list of chunk strings.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        A list of strings, one per chunk produced by the text splitter
        (chunk_size=1000, chunk_overlap=0).
    """
    # Load the PDF into LangChain Document objects.
    loader = PyPDFLoader(file_path)
    pages = loader.load()

    # Split the loaded documents into smaller chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(pages)

    # Keep only the chunk text. str(doc) would serialise the whole Document
    # (page_content plus metadata, with repr escaping), and that noise would
    # then be embedded instead of the actual PDF text.
    return [doc.page_content for doc in documents]

# Define a function to create embeddings
def create_embeddings(texts):
    """Embed each string in *texts* with the module-level ``embed`` model.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        A list of embedding vectors (lists of floats), in the same order as
        *texts*. An empty input yields an empty list without calling the API.
    """
    texts = list(texts)
    if not texts:
        return []
    # embed_documents takes a list of strings and returns one vector per
    # string. The previous code passed a single string and then indexed the
    # returned list like an OpenAI response dict, which raised TypeError.
    return embed.embed_documents(texts)

# Define a function to upsert embeddings to Pinecone
def upsert_embeddings_to_pinecone(index, embeddings, ids):
    """Upsert *embeddings* into *index*, pairing each vector with an ID.

    IDs and embeddings are paired positionally as ``(id, embedding)`` tuples.
    If the two sequences differ in length, the extra items of the longer one
    are ignored.
    """
    id_vector_pairs = list(zip(ids, embeddings))
    index.upsert(vectors=id_vector_pairs)

# Process a PDF and create embeddings
file_path = "./Sample.pdf"  # Replace with your actual file path
texts = process_pdf(file_path)
embeddings = create_embeddings(texts)

# Upsert the embeddings to Pinecone.
# Every chunk needs its own ID: the upsert helper pairs IDs and vectors with
# zip(), so passing a single ID would store only the first chunk.
chunk_ids = [f"{file_path}-{i}" for i in range(len(embeddings))]
upsert_embeddings_to_pinecone(index, embeddings, chunk_ids)
0

There are 0 best solutions below