I want to upload my document to Pinecone and ask questions about the document. I want to ask them in the terminal itself. I know there are Streamlit videos on YouTube, but I don't want to use Streamlit; I'm just checking this from the Python terminal. (Lorem filler just to satisfy Stack Overflow's conditions: ljdlandlnaljdnlasndkjlanldn)
import os
# Initialize Pinecone
#pinecone.init(api_key="", environment="eu-west-gcp")
import os
import re
import pdfplumber
import openai
import pinecone
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
# --- OpenAI embedding configuration -----------------------------------------
# Credentials are taken from the environment when the variables are set; the
# literal fallbacks are placeholders only and must be replaced (or the
# variables exported) before either service can be reached.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API-KEY")
MODEL = "text-embedding-ada-002"
# Both names exist at module level; keep them pointing at the same model.
model_name = MODEL

# Embedding client shared by the functions below.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
)

# --- Pinecone configuration ---------------------------------------------------
# Initialize Pinecone
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", "API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
)

# Define the index name
index_name = "my-index"

# Create the index if it doesn't exist. The dimension has to equal the length
# of the vectors that will be upserted; 1536 is the value used here for the
# text-embedding-ada-002 model.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536)

# Instantiate the index
index = pinecone.Index(index_name)
# Define a function to preprocess text
def preprocess_text(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each run of spaces, newlines, tabs (anything matched
        by ``\\s``) replaced by one space. Leading and trailing whitespace
        is collapsed to a single space as well; it is not stripped.
    """
    # Replace consecutive spaces, newlines and tabs
    return re.sub(r'\s+', ' ', text)
def process_pdf(file_path):
    """Load a PDF and return its text split into chunk strings.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        A list of strings, one per chunk produced by the text splitter.
    """
    # Load the PDF into LangChain documents.
    pages = PyPDFLoader(file_path).load()

    # Split the loaded documents into smaller chunks (chunk_size=1000, no
    # overlap between neighbouring chunks).
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    # Keep only the chunk text. Calling str() on a Document would also embed
    # its metadata and field names into the string that gets vectorised.
    return [chunk.page_content for chunk in chunks]
# Define a function to create embeddings
def create_embeddings(texts):
    """Return one embedding vector per entry of ``texts``.

    Args:
        texts: An iterable of strings to embed. A single string is treated
            as a one-element list rather than being split into characters.

    Returns:
        A list of embedding vectors as returned by
        ``embed.embed_documents`` for the given texts; an empty list when
        ``texts`` is empty (no request is made in that case).
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        return []

    # embed_documents takes the whole list of strings and returns a list of
    # vectors, so one batched call replaces the per-text loop and there is
    # no response dictionary to index into.
    return embed.embed_documents(texts)
# Define a function to upsert embeddings to Pinecone
def upsert_embeddings_to_pinecone(index, embeddings, ids, batch_size=100):
    """Upsert ``(id, embedding)`` pairs into ``index`` in batches.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        embeddings: Sequence of embedding vectors.
        ids: Sequence of vector ids, paired positionally with
            ``embeddings``. Pairing uses ``zip``, so surplus items in the
            longer sequence are ignored.
        batch_size: Maximum number of vectors sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors = list(zip(ids, embeddings))

    # Send the vectors in slices so a large document does not end up in one
    # oversized request; with no vectors, nothing is sent at all.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
# Process a PDF and create embeddings
file_path = "./Sample.pdf"  # Replace with your actual file path
texts = process_pdf(file_path)
embeddings = create_embeddings(texts)

# Upsert the embeddings to Pinecone. Each vector needs its own id: the upsert
# helper pairs ids with embeddings positionally, so a single id would store
# only the first chunk. Suffix the file path with the chunk position instead.
chunk_ids = [f"{file_path}-{position}" for position in range(len(embeddings))]
upsert_embeddings_to_pinecone(index, embeddings, chunk_ids)