I am trying to extract text from two PDF files, compare their contents, and save the differences to an Excel file. When I run the code given below, I get an error. The full traceback is included after the code.
`
import pdfminer
import pandas as pd
from time import sleep
from tqdm import tqdm
from itertools import chain
import slate
# Extract lines of text from each PDF, compute the lines that appear in only
# one of the two files, and write them to differences.xlsx.

# List of pdf files to process
pdf_files = ['file1.pdf', 'file2.pdf']

# Lines of text per PDF: the first file feeds pdf1_text, any other file
# feeds pdf2_text.
pdf1_text = []
pdf2_text = []

# Iterate through each pdf file
for pdf_file in tqdm(pdf_files):
    # Open the pdf file
    with open(pdf_file, 'rb') as pdf_now:
        # Extract text using slate.
        # NOTE(review): the traceback reported for this script is raised from
        # this call, inside pdfminer's apply_png_predictor (a "%d" format
        # applied to bytes). That looks like a problem in the installed
        # pdfminer package rather than in this script -- confirm which
        # pdfminer distribution slate is using.
        text = slate.PDF(pdf_now)
        # NOTE(review): only element 0 of the slate result is used;
        # presumably that is the first page -- confirm whether the remaining
        # pages should be compared as well.
        text = text[0].split('\n')
    if pdf_file == pdf_files[0]:
        pdf1_text.extend(text)
    else:
        pdf2_text.extend(text)
    # NOTE(review): the original paste lost its indentation; this pause is
    # assumed to run once per file -- confirm.
    sleep(20)

# Sets give O(1) membership tests when deciding which column a line goes in.
pdf1_lines = set(pdf1_text)
pdf2_lines = set(pdf2_text)
differences = pdf1_lines.symmetric_difference(pdf2_lines)


def _escape(value):
    """Return *value* with non-ASCII/control characters backslash-escaped."""
    return value.encode('unicode_escape').decode('utf-8')


# Build every row up front and create the dataframe once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on each call before
# that. Sorting makes the row order reproducible between runs (set iteration
# order is not).
rows = [
    {
        'pdf1_text': _escape(line) if line in pdf1_lines else '',
        'pdf2_text': _escape(line) if line in pdf2_lines else '',
    }
    for line in sorted(differences)
]
differences_df = pd.DataFrame(rows, columns=['pdf1_text', 'pdf2_text'])

# Write the dataframe to an excel sheet
differences_df.to_excel('differences.xlsx', index=False, engine='openpyxl')
import openpyxl
import re
# Re-open differences.xlsx and drop every row whose text in either column has
# fewer than 10 whitespace-separated words, then overwrite the file.

# Load the Excel file into a dataframe
df = pd.read_excel("differences.xlsx")

for column in ["pdf1_text", "pdf2_text"]:
    # Only string cells can be "too short". Empty cells come back from Excel
    # as NaN and must not cause the row to be dropped. Testing each cell
    # directly (instead of using the .str accessor) also works when a whole
    # column is empty and was therefore read back with a float dtype, where
    # .str would raise AttributeError.
    too_short = df[column].map(
        lambda cell: isinstance(cell, str) and len(cell.split()) < 10
    ).astype(bool)  # keep a real boolean mask even when the frame is empty
    # Drop the rows that meet the condition
    df = df[~too_short]

# Save the modified dataframe back to the same Excel file
df.to_excel("differences.xlsx", index=False)
That is my code. Below is the error I am getting, with the whole traceback:
Traceback (most recent call last):
File "c:\Users\lmohandas\stuff\1801pdfs\slatetrial.py", line 22, in <module>
text = slate.PDF(pdf_now)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\slate\classes.py", line 61, in __init__
self.doc = PDFDocument(self.parser, password)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 558, in __init__
self.read_xref_from(parser, pos, self.xrefs)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 789, in read_xref_from
xref.load(parser)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 242, in load
self.data = stream.get_data()
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdftypes.py", line 292, in get_data
self.decode()
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdftypes.py", line 283, in decode
data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\utils.py", line 46, in apply_png_predictor
raise ValueError("Unsupported predictor value: %d"%ft)
TypeError: %d format: a real number is required, not bytes