pdf to text reading from different file

46 Views Asked by At

I have many question and solution files in pdf format. For each file there corresponds a questions-solution file pair. I am trying to prepare a dataset to practice questions and solutions. But to my surprise, I am getting solution and questions from different pairs. Because of this, the solution and questions are not matching. Can somebody please help to let me know what is the mistake I am doing. The conversion of filename is correct and the file exists for both Question and Answer. I am using pdftotext to convert pdf to text.

The text I am getting as answer does not exist in the solution file. I have tried to save the file2 name as well, it is also correct. Also, I am getting error for each pdf.

"poppler/error: Couldn't find group for reference"

Below is my code:-

def check_ans(Question, Answer):
    if Question in Answer:
        Answer = Answer.replace(Question,"")
    return Answer

def parse(string):
    string = str(string)
    string = re.sub(r'[^\x00-\x7F]+', '', string)
    string = re.sub(r'[ \t]+', ' ', string)
    string = re.sub(r'^\s*\n', '', string, flags=re.MULTILINE)
    string = string.replace('\x0C', '')
    return string

def read_pdf_with_2_columns(pdf_path, strings, substrings):
    with open(pdf_path, "rb") as f:
        pdf = pdftotext.PDF(f)
        text=""
        pages = list(pdf)
        for page in pages:
            page_text = page
            page_text =  re.sub(r'\s+$', '', page_text)
            splits = page_text.split("\n")
            max_length = 0
            for split in splits:
                replace=False
                for string in strings:
                    if string in split.lower():
                        page_text = page_text.replace(split, "")
                        replace=True
                        break
                if(replace==False):
                    if(max_length<len(split)):
                        max_length=len(split)
            half_length = int((max_length+1)/2)
            left =""
            right=""
            splits = page_text.split("\n")
            for split in splits:
                left_strip = split[:half_length].strip()
                if(len(left_strip)>0):
                    equal=False
                    for string in substrings:
                        if(left_strip==string):
                            equal=True
                    if(equal==False):
                        left=left+left_strip+"\n"
                right_strip = split[half_length:].strip()
                if(len(right_strip)>0):
                    equal=False
                    for string in substrings:
                        if(right_strip==string):
                            equal=True
                    if(equal==False):
                        right=right+right_strip+"\n"
            text = text+"\n"+re.sub(r'[^\x00-\x7F]+', '', left)+re.sub(r'[^\x00-\x7F]+', '', right)
    return parse(text)

def read_pdf_with_1_column(pdf_path, strings, substrings):
    with open(pdf_path, "rb") as f:
        pdf = pdftotext.PDF(f)
        text=""
        pages = list(pdf)
        for page in pages:
            page_text = page
            splits = page_text.split("\n")
            for split in splits:
                for string in strings:
                    if string in split.lower():
                        page_text = page_text.replace(split, "")
                        break
            text = text+"\n"+re.sub(r'[^\x00-\x7F]+', '', page_text)
        return parse(text)

def add_to_df(file, dirname, df, strings, substrings, columns_q, columns_a, start_char_q, end_char_q, start_char_a, end_char_a, source, subject):
    file2 = re.sub(r'(T\d+)Q', r'\1S', file)
    if(os.path.isfile(dirname+"\\"+file2)):
        if(columns_q):
            questions_text = read_pdf_with_2_columns(file, strings, substrings)
        else:
            questions_text = read_pdf_with_1_column(file, strings, substrings)
        if(columns_a):
            answers_text = read_pdf_with_2_columns(file2, strings, substrings)
        else:
            answers_text = read_pdf_with_1_column(file2, strings, substrings)
        ques_count=0
        ans_count=0
        for i in range(10, 101):
            ques=""
            ques_split=questions_text.split(start_char_q+str(i)+end_char_q)
            if(len(ques_split)>1):
                ques_split = ques_split[1].split(start_char_q+str(i+1)+end_char_q)
            if(len(ques_split)>1):
                ques = ques_split[0]
            if(len(ques)>40):
                ques_count+=1
                ans_split=answers_text.split(start_char_a+str(i)+end_char_a)
                if(len(ans_split)>1):
                    ans_split = ans_split[1].split(start_char_a+str(i+1)+end_char_a)
                    if(len(ans_split)>1):
                        ans = ans_split[0]
                        ans_count+=1
                        df.loc[len(df.index)] = [source, subject, file, file2, ques, ans]
    return df
0

There are 0 best solutions below