Can someone help me extract text from a PDF file within a given range of page numbers (e.g. text from page 31 to page 39)?

120 Views Asked by At
import PyPDF2
# Source PDF and the text file that receives the extracted text.
pdf_path = 'C:\\sem1\\691-project\\Dataset\\Maths\\A Spiral Workbook for Discrete Mathematics.pdf'
txt_path = 'C:\\sem1\\691-project\\Dataset\\Maths\\A Spiral Workbook for Discrete Mathematics.txt'

# Context managers close both files even if extraction raises part-way through.
with open(pdf_path, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    # 'a' appends to any existing output. UTF-8 is requested explicitly because
    # the platform default encoding may be unable to encode symbols found in
    # the extracted text.
    with open(txt_path, 'a', encoding='utf-8') as out_file:
        # Iterating pdfReader.pages visits every page, i.e. the whole book.
        for pageObj in pdfReader.pages:
            page_text = pageObj.extract_text()
            print(page_text)
            out_file.write(page_text)

I am able to extract text from the whole book, but I need the text only from selected page numbers or a selected range of pages.

3

There are 3 best solutions below

0
Achraf Ben Salah On BEST ANSWER

You can try it like this:

import PyPDF2

start_page = 31  # First page to extract (1-based, inclusive)
end_page = 39  # Last page to extract (1-based, inclusive)

pdf_file_path = 'C:\\sem1\\691-project\\Dataset\\Maths\\A Spiral Workbook for Discrete Mathematics.pdf'
output_file_path = 'C:\\sem1\\691-project\\Dataset\\Maths\\A Spiral Workbook for Discrete Mathematics.txt'

with open(pdf_file_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # pdf_reader.pages is 0-indexed, so page N lives at index N - 1.
    # Clamp both ends: a start below page 1 would otherwise become a negative
    # index (counting from the end), and an end beyond the last page would
    # raise IndexError.
    first_index = max(start_page - 1, 0)
    stop_index = min(end_page, len(pdf_reader.pages))
    # Explicit UTF-8 so symbols in the extracted text do not depend on the
    # platform default encoding; 'a' appends to any existing output.
    with open(output_file_path, 'a', encoding='utf-8') as output_file:
        for page_number in range(first_index, stop_index):
            page_obj = pdf_reader.pages[page_number]
            page_text = page_obj.extract_text()
            print(page_text)
            output_file.write(page_text)

The range() function is used to obtain the page numbers in the specified range, and then the corresponding page objects are extracted using pdf_reader.pages[page_number]. The extracted text is then written to the output_file.

0
Prudhviraj Panisetti On

Actually, you can modify your code to iterate through the pages and extract text only for the pages within the desired range:

import PyPDF2


pdf_file_path = 'C:/sem1/691-project/Dataset/Maths/A Spiral Workbook for Discrete Mathematics.pdf'
txt_file_path = 'C:/sem1/691-project/Dataset/Maths/Extracted_Pages.txt'

# 1-based, inclusive page range to extract.
start_page = 31
end_page = 39

# Context managers close both files even if extraction raises part-way through.
with open(pdf_file_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page_count = len(pdf_reader.pages)
    # Never read past the end of the document.
    last_page = min(end_page, page_count)

    with open(txt_file_path, 'a', encoding='utf-8') as txt_file:
        for page_number in range(start_page, last_page + 1):
            # pdf_reader.pages is 0-indexed; page_number is 1-based.
            page = pdf_reader.pages[page_number - 1]
            page_text = page.extract_text()

            # Label each page so the output can be mapped back to the PDF.
            txt_file.write(f"Page {page_number}:\n")
            txt_file.write(page_text)
            txt_file.write('\n\n')

# Report the range that was actually written, which may be shorter than the
# requested one when the PDF has fewer than end_page pages.
if last_page >= start_page:
    print(f"Text extracted from pages {start_page} to {last_page} and saved to '{txt_file_path}'.")
else:
    print(f"No text extracted: the PDF has only {page_count} pages, fewer than start page {start_page}.")
0
AKX On

Use .getPage() instead (on PyPDF2 3.0 and later, where .getPage() was removed, index pdfReader.pages instead), e.g.

import pathlib

import PyPDF2

directory = pathlib.Path("C:\\sem1\\691-project\\Dataset\\Maths")
source_file = directory / "A Spiral Workbook for Discrete Mathematics.pdf"
with open(source_file, "rb") as fp:
    pdfReader = PyPDF2.PdfReader(fp)
    # The output sits next to the PDF with a .txt suffix; "a" appends to any
    # existing file. UTF-8 is explicit so the result does not depend on the
    # platform default encoding.
    with open(source_file.with_suffix(".txt"), "a", encoding="utf-8") as out_file:
        for i in range(3, 11):  # pages 3..10
            # Pages are 0-indexed, hence i - 1. The legacy getPage() and
            # extractText() names were removed in PyPDF2 3.0; pages[...] and
            # extract_text() are their replacements.
            out_file.write(pdfReader.pages[i - 1].extract_text())

(I also took the liberty of deduplicating your path-handling code a bit.)