Sentence tokenization splits sentences where there is dialogue followed by 'he/she said', how can I fix this?

40 Views Asked by Paige Cox At 11 May 2023 at 12:34

I am extracting sentences from pdfs into an excel spreadsheet. It handles most cases fine, but when there is a dialogue which ends in ? or ! and is followed by something like 'he said.' it splits the quote and the 'he said' into separate sentences.

For example: "Ingaba ibhasi isishiyile?" babebuza. which means "Has the bus left?" they asked. gets split into "Ingaba ibhasi isishiyile?" and babebuza.

How can I fix my code so it doesn't split these type of sentences? Note that it's not just in the case of babebuza (they asked) it happens with all cases where the quotes ends in ? or !

This is what I have tried but no luck so far.

I tried to code for instances where a sentence ends in ?" or !" and is followed by a word/sentence that starts with a lower case letter- so that these get treated as one sentence, but this didn't give the desired outcome.

import pdfplumber
import pandas as pd
import nltk
import re
import glob

def exclude_page_numbers(page_text, page_elements, exclusion_regions):
    filtered_text = ""

    for element in page_elements:
        text = element["text"]
        x, y = element["x0"], element["top"]  # Get the x and y coordinates of the element

        # Check if the element falls within any exclusion region
        exclude = False
        for region in exclusion_regions:
            x_min, y_min, x_max, y_max = region
            if x_min <= x <= x_max and y_min <= y <= y_max:
                exclude = True
                break

        if not exclude and not any(char.isdigit() for char in text) and not text.strip().isdigit():
            filtered_text += text + " "  # Add a space after each element

    return filtered_text


def extract_sentences(text):
    # Replace “ and ” with normal double quotes "
    text = text.replace("“", '"').replace("”", '"')
       
    # Replace ‘ and ’ with normal single quotes '
    text = text.replace("‘", "'").replace("’", "'")

    # Split text on occurrences of `.`, `!`, or `?` while considering quotation marks within the sentence
    sentences = re.split(r'(?<=[.!?])(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
    sentences = [sentence.strip() for sentence in sentences]

    # Merge the closing quotation mark with the preceding sentence
    merged_sentences = []
    i = 0
    while i < len(sentences):
        if sentences[i].endswith('"'):
            if i + 1 < len(sentences):
                merged_sentence = sentences[i] + ' ' + sentences[i + 1]
                merged_sentences.append(merged_sentence)
                i += 2
            else:
                merged_sentences.append(sentences[i])
                i += 1
        else:
            merged_sentences.append(sentences[i])
            i += 1

    # Join adjacent sentences if the first sentence ends with ?" or !" and the second sentence starts with a lowercase letter
    final_sentences = []
    i = 0
    while i < len(merged_sentences):
        if i + 1 < len(merged_sentences):
            first_sentence = merged_sentences[i]
            second_sentence = merged_sentences[i + 1]
            if re.search(r'[?!]"?\s*$', first_sentence) and re.search(r'^[a-z]', second_sentence):
                joined_sentence = first_sentence + ' ' + second_sentence
                final_sentences.append(joined_sentence)
                i += 2
                continue
        final_sentences.append(merged_sentences[i])
        i += 1

    # Tokenize final sentences using nltk.sent_tokenize
    tokenized_sentences = []
    for sentence in final_sentences:
        tokenized_sentences.extend(nltk.sent_tokenize(sentence))

    return tokenized_sentences

def extract_text_from_pdf(pdf_file_path):
    with pdfplumber.open(pdf_file_path) as pdf:
        num_pages = len(pdf.pages)
        text = ""
        for page in range(3, num_pages - 2):
            page_obj = pdf.pages[page]
            page_text = page_obj.extract_text()
            page_elements = page_obj.extract_words()  # Obtain the text elements and their positions
            filtered_text = exclude_page_numbers(page_text, page_elements, exclusion_regions)
   def scrape_sentences_to_excel(pdf_file_paths, excel_file_path):
    sentences = []
    for pdf_file_path in pdf_file_paths:
        # Extract text from PDF
        pdf_text = extract_text_from_pdf(pdf_file_path)

        # Extract sentences from the text
        extracted_sentences = extract_sentences(pdf_text)
        sentences.extend(extracted_sentences)

    # Create a pandas DataFrame with a single column for sentences
    df = pd.DataFrame({'Sentences': sentences})

    # Write the DataFrame to an Excel file
    df.to_excel(excel_file_path, index=False)

    print("Sentences extracted and saved to Excel successfully!")

pdf_folder_path = 'C:/Users/Paige Cox/Desktop/Little Zebra Books/pdfs'  # Folder path containing PDFs
excel_file_path = 'C:/Users/Paige Cox/Desktop/Little Zebra Books/pdfs/sentence_corpus_xhosa.xlsx'  # Output Excel file path

# Specify the exclusion regions as (x_min, y_min, x_max, y_max)
exclusion_regions = [(42, 558, 359, 559)]  # Add your exclusion regions here

# Get a list of PDF files in the folder
pdf_file_paths = glob.glob(pdf_folder_path + '/*.pdf')

# Scrape sentences from PDFs and save to Excel
scrape_sentences_to_excel(pdf_file_paths, excel_file_path)

Original Q&A

Sentence tokenization splits sentences where there is dialogue followed by 'he/she said', how can I fix this?

There are 0 best solutions below

Related Questions in WEB-SCRAPING

Related Questions in NLP

Related Questions in NLTK

Related Questions in TOKENIZE

Related Questions in CORPUS

Trending Questions

Popular # Hahtags

Popular Questions