I am extracting sentences from pdfs into an excel spreadsheet. It handles most cases fine, but when there is a dialogue which ends in ? or ! and is followed by something like 'he said.' it splits the quote and the 'he said' into separate sentences.
For example: "Ingaba ibhasi isishiyile?" babebuza. which means "Has the bus left?" they asked. gets split into "Ingaba ibhasi isishiyile?" and babebuza.
How can I fix my code so it doesn't split these type of sentences? Note that it's not just in the case of babebuza (they asked) it happens with all cases where the quotes ends in ? or !
This is what I have tried but no luck so far.
I tried to code for instances where a sentence ends in ?" or !" and is followed by a word/sentence that starts with a lower case letter- so that these get treated as one sentence, but this didn't give the desired outcome.
import pdfplumber
import pandas as pd
import nltk
import re
import glob
def exclude_page_numbers(page_text, page_elements, exclusion_regions):
filtered_text = ""
for element in page_elements:
text = element["text"]
x, y = element["x0"], element["top"] # Get the x and y coordinates of the element
# Check if the element falls within any exclusion region
exclude = False
for region in exclusion_regions:
x_min, y_min, x_max, y_max = region
if x_min <= x <= x_max and y_min <= y <= y_max:
exclude = True
break
if not exclude and not any(char.isdigit() for char in text) and not text.strip().isdigit():
filtered_text += text + " " # Add a space after each element
return filtered_text
def extract_sentences(text):
# Replace “ and ” with normal double quotes "
text = text.replace("“", '"').replace("”", '"')
# Replace ‘ and ’ with normal single quotes '
text = text.replace("‘", "'").replace("’", "'")
# Split text on occurrences of `.`, `!`, or `?` while considering quotation marks within the sentence
sentences = re.split(r'(?<=[.!?])(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
sentences = [sentence.strip() for sentence in sentences]
# Merge the closing quotation mark with the preceding sentence
merged_sentences = []
i = 0
while i < len(sentences):
if sentences[i].endswith('"'):
if i + 1 < len(sentences):
merged_sentence = sentences[i] + ' ' + sentences[i + 1]
merged_sentences.append(merged_sentence)
i += 2
else:
merged_sentences.append(sentences[i])
i += 1
else:
merged_sentences.append(sentences[i])
i += 1
# Join adjacent sentences if the first sentence ends with ?" or !" and the second sentence starts with a lowercase letter
final_sentences = []
i = 0
while i < len(merged_sentences):
if i + 1 < len(merged_sentences):
first_sentence = merged_sentences[i]
second_sentence = merged_sentences[i + 1]
if re.search(r'[?!]"?\s*$', first_sentence) and re.search(r'^[a-z]', second_sentence):
joined_sentence = first_sentence + ' ' + second_sentence
final_sentences.append(joined_sentence)
i += 2
continue
final_sentences.append(merged_sentences[i])
i += 1
# Tokenize final sentences using nltk.sent_tokenize
tokenized_sentences = []
for sentence in final_sentences:
tokenized_sentences.extend(nltk.sent_tokenize(sentence))
return tokenized_sentences
def extract_text_from_pdf(pdf_file_path):
with pdfplumber.open(pdf_file_path) as pdf:
num_pages = len(pdf.pages)
text = ""
for page in range(3, num_pages - 2):
page_obj = pdf.pages[page]
page_text = page_obj.extract_text()
page_elements = page_obj.extract_words() # Obtain the text elements and their positions
filtered_text = exclude_page_numbers(page_text, page_elements, exclusion_regions)
def scrape_sentences_to_excel(pdf_file_paths, excel_file_path):
sentences = []
for pdf_file_path in pdf_file_paths:
# Extract text from PDF
pdf_text = extract_text_from_pdf(pdf_file_path)
# Extract sentences from the text
extracted_sentences = extract_sentences(pdf_text)
sentences.extend(extracted_sentences)
# Create a pandas DataFrame with a single column for sentences
df = pd.DataFrame({'Sentences': sentences})
# Write the DataFrame to an Excel file
df.to_excel(excel_file_path, index=False)
print("Sentences extracted and saved to Excel successfully!")
pdf_folder_path = 'C:/Users/Paige Cox/Desktop/Little Zebra Books/pdfs' # Folder path containing PDFs
excel_file_path = 'C:/Users/Paige Cox/Desktop/Little Zebra Books/pdfs/sentence_corpus_xhosa.xlsx' # Output Excel file path
# Specify the exclusion regions as (x_min, y_min, x_max, y_max)
exclusion_regions = [(42, 558, 359, 559)] # Add your exclusion regions here
# Get a list of PDF files in the folder
pdf_file_paths = glob.glob(pdf_folder_path + '/*.pdf')
# Scrape sentences from PDFs and save to Excel
scrape_sentences_to_excel(pdf_file_paths, excel_file_path)