Extracting text from a PDF - python

143 Views Asked by At

I am new to Python and I am developing a program that takes a PDF file as input and converts it into text. I am using Python 3 and have tried both the PyPDF2 and PDFMiner.six packages.

For the first PDF file, it converts well and prints the text to the console. But when I use another PDF file that contains some empty pages, it prints the text until it reaches an empty page, and then this error appears.

Here is the code:

PyPDF2

import PyPDF2


def extract_txt_pdf(pdf_file: str) -> list[str]:
    """Extract the text of every page in a PDF file.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One entry per page, in page order, holding whatever
        ``page.extract_text()`` produced for that page.
    """
    # Open in binary mode: the reader is given the raw byte stream.
    with open(pdf_file, 'rb') as pdf:
        reader = PyPDF2.PdfReader(pdf, strict=False)
        # Extract while the file is still open: the reader was handed this
        # stream, so the pages must be pulled before the ``with`` block
        # closes it.
        return [page.extract_text() for page in reader.pages]

PDFMiner.six

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    Args:
        path: Path of the PDF file to convert.

    Returns:
        The text the ``TextConverter`` wrote into the in-memory buffer while
        all pages were processed.

    Note:
        The returned string may contain characters (for example U+25B6) that
        a console using a legacy code page such as cp1252 cannot encode, so
        printing it can raise ``UnicodeEncodeError``. Choosing an output
        encoding is left to the caller.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    password = ""
    maxpages = 0  # presumably "no page limit" -- confirm against pdfminer docs
    caching = True
    pagenos = set()

    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``with`` guarantees the PDF file is closed even if a page fails.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer once, after every page has been processed and
        # before the buffer is closed below.
        return retstr.getvalue()
    finally:
        # Release the converter and the buffer on success and on error alike.
        device.close()
        retstr.close()

And here is the error I get with pdfminer:

  print(t)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2032.0_x64__qbz5n2kfra8p0\Lib\encodings\cp1252.py", line 19, in encode  return codecs.charmap_encode(input,self.errors,encoding_table)[0]
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  UnicodeEncodeError: 'charmap' codec can't encode character '\u25b6' in position 0: 
 character maps to <undefined>
0

There are 0 best solutions below