I have many question and solution files in pdf format. For each file there corresponds a questions-solution file pair. I am trying to prepare a dataset to practice questions and solutions. But to my surprise, I am getting solution and questions from different pairs. Because of this, the solution and questions are not matching. Can somebody please help to let me know what is the mistake I am doing. The conversion of filename is correct and the file exists for both Question and Answer. I am using pdftotext to convert pdf to text.
The text I am getting as answer does not exist in the solution file. I have tried to save the file2 name as well, it is also correct. Also, I am getting error for each pdf.
"poppler/error: Couldn't find group for reference"
Below is my code:-
def check_ans(Question, Answer):
if Question in Answer:
Answer = Answer.replace(Question,"")
return Answer
def parse(string):
string = str(string)
string = re.sub(r'[^\x00-\x7F]+', '', string)
string = re.sub(r'[ \t]+', ' ', string)
string = re.sub(r'^\s*\n', '', string, flags=re.MULTILINE)
string = string.replace('\x0C', '')
return string
def read_pdf_with_2_columns(pdf_path, strings, substrings):
with open(pdf_path, "rb") as f:
pdf = pdftotext.PDF(f)
text=""
pages = list(pdf)
for page in pages:
page_text = page
page_text = re.sub(r'\s+$', '', page_text)
splits = page_text.split("\n")
max_length = 0
for split in splits:
replace=False
for string in strings:
if string in split.lower():
page_text = page_text.replace(split, "")
replace=True
break
if(replace==False):
if(max_length<len(split)):
max_length=len(split)
half_length = int((max_length+1)/2)
left =""
right=""
splits = page_text.split("\n")
for split in splits:
left_strip = split[:half_length].strip()
if(len(left_strip)>0):
equal=False
for string in substrings:
if(left_strip==string):
equal=True
if(equal==False):
left=left+left_strip+"\n"
right_strip = split[half_length:].strip()
if(len(right_strip)>0):
equal=False
for string in substrings:
if(right_strip==string):
equal=True
if(equal==False):
right=right+right_strip+"\n"
text = text+"\n"+re.sub(r'[^\x00-\x7F]+', '', left)+re.sub(r'[^\x00-\x7F]+', '', right)
return parse(text)
def read_pdf_with_1_column(pdf_path, strings, substrings):
with open(pdf_path, "rb") as f:
pdf = pdftotext.PDF(f)
text=""
pages = list(pdf)
for page in pages:
page_text = page
splits = page_text.split("\n")
for split in splits:
for string in strings:
if string in split.lower():
page_text = page_text.replace(split, "")
break
text = text+"\n"+re.sub(r'[^\x00-\x7F]+', '', page_text)
return parse(text)
def add_to_df(file, dirname, df, strings, substrings, columns_q, columns_a, start_char_q, end_char_q, start_char_a, end_char_a, source, subject):
file2 = re.sub(r'(T\d+)Q', r'\1S', file)
if(os.path.isfile(dirname+"\\"+file2)):
if(columns_q):
questions_text = read_pdf_with_2_columns(file, strings, substrings)
else:
questions_text = read_pdf_with_1_column(file, strings, substrings)
if(columns_a):
answers_text = read_pdf_with_2_columns(file2, strings, substrings)
else:
answers_text = read_pdf_with_1_column(file2, strings, substrings)
ques_count=0
ans_count=0
for i in range(10, 101):
ques=""
ques_split=questions_text.split(start_char_q+str(i)+end_char_q)
if(len(ques_split)>1):
ques_split = ques_split[1].split(start_char_q+str(i+1)+end_char_q)
if(len(ques_split)>1):
ques = ques_split[0]
if(len(ques)>40):
ques_count+=1
ans_split=answers_text.split(start_char_a+str(i)+end_char_a)
if(len(ans_split)>1):
ans_split = ans_split[1].split(start_char_a+str(i+1)+end_char_a)
if(len(ans_split)>1):
ans = ans_split[0]
ans_count+=1
df.loc[len(df.index)] = [source, subject, file, file2, ques, ans]
return df