# Required packages (install before running):
#   pip install llama-index-embeddings-openai
#   pip install -U llama-index-readers-file
from llama_index.core import SimpleDirectoryReader  # was missing: reader is used below

# Load the PDF into a list of llama-index Document objects
# (SimpleDirectoryReader yields one Document per PDF page by default).
reader = SimpleDirectoryReader(input_files=["ai.pdf"])
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")
from llama_index.core.node_parser import (
SentenceSplitter,
SemanticSplitterNodeParser,
)
from llama_index.embeddings.openai import OpenAIEmbedding
# Configure the semantic splitter: it breaks text where embedding
# similarity between adjacent sentence groups drops below the given
# percentile, instead of splitting at a fixed chunk size.
import os

# SECURITY NOTE(review): never hard-code a real API key in source —
# export OPENAI_API_KEY in the environment instead.
os.environ["OPENAI_API_KEY"] = "key"

embed_model = OpenAIEmbedding()
splitter = SemanticSplitterNodeParser(
    buffer_size=1,                        # sentences grouped per window when comparing embeddings
    breakpoint_percentile_threshold=95,   # split at the top 5% of dissimilarity jumps
    embed_model=embed_model,
)
# Baseline fixed-size splitter, kept for comparison with the semantic one:
# Baseline splitter: fixed ~512-token chunks, no embedding calls needed.
base_splitter = SentenceSplitter(chunk_size=512)

# Run the semantic splitter over every loaded document.
nodes = splitter.get_nodes_from_documents(docs)

# Peek at one node — guard the index so short documents don't raise IndexError.
if len(nodes) > 5:
    print(nodes[5])
# The loop below is the part I want assessed:
# Print every semantic chunk with a 1-based label.
# (Original line used a C-style `//` comment, which is a SyntaxError in Python.)
for i, chunk in enumerate(nodes, start=1):
    print(f"CHUNK {i}: ", chunk)
So far I have been working with the LlamaIndex semantic chunker, and I was able to produce chunks for the whole PDF file. But how do I get the chunks of one specific page — say, page 5 of the file? Is there a special way I have to treat the PDF? (Hint: each node carries its source page in `node.metadata["page_label"]`, so you can filter `nodes` on that value.)
I tried `for i, chunk in enumerate(nodes[4]): print(f"CHUNK {i+1}: ", chunk)`, but that iterates over a single node's attribute pairs rather than a list of chunks, so it prints the node's metadata — file name, embedding, id, etc. — each labelled as a different "chunk": `chunk1: ('id', '3235632354')`, `chunk2: ('embedding', None)`, and so on.