I'm working on a project in which I need to extract the comments from a PDF file and sort them based on their issuing date and their replies (if there are any). Currently I'm using PdfReader from the pypdf library, which works great for extracting the comments, but I'm identifying the replies based on their positions (comparing "/Rect" values to find out whether a comment is a reply to a specific comment). The method seems to work fine most of the time, but I was wondering if there's a smarter way of identifying the replies, so I can be more confident in the procedure. I tried to use "/P" from the PdfReader properties, but it seems to be identical for all comments on the same page.
Thanks a lot!
This is how I make my main df to process the properties:
# Build one DataFrame holding every annotation in the PDF, with a leading
# column giving the (1-based) page number the annotation came from.
#
# NOTE(review): to identify replies reliably, look at each annotation's
# "/IRT" (in-reply-to) entry — a reply annotation references its parent
# annotation directly there, which is far more robust than comparing
# "/Rect" positions.  Verify against the PDF 32000 annotation spec.
src = 'test.pdf'
input1 = PdfReader(src)
nPages = len(input1.pages)

# Collect one frame per annotated page and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop is quadratic.
page_frames = []
for i in range(nPages):
    page0 = input1.pages[i]
    # Pages without annotations simply lack the "/Annots" key; test for it
    # explicitly instead of catching (and silently swallowing) every error.
    annots = page0.get('/Annots')
    if not annots:
        continue
    annotation = [annot.get_object() for annot in annots]
    # Page-number column first, annotation properties after it, matching
    # the original column layout of df_comments.
    page_col = pd.DataFrame([i + 1] * len(annotation))
    page_frames.append(pd.concat([page_col, pd.DataFrame(annotation)], axis=1))

df_comments = (
    pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
)
and this is how I extract the replies and sort them (there should be a smarter way!):
# Split cmnt_df into top-level comments and their replies.  A row whose
# "/Rect" coordinates (loc_x0/y0/x1/y1) nearly coincide with the current
# comment's rectangle (at least 2 of the 4 coordinates within tolerance)
# is treated as a reply to it.
#
# NOTE(review): a sturdier signal is the annotation's "/IRT" (in-reply-to)
# entry, which references the parent annotation directly — consider
# extracting it alongside "/Rect" and matching on that instead.
#
# Fixes over the previous version:
#   * no division by zero when a rectangle coordinate is exactly 0;
#   * the bookkeeping list of consumed rows is reset every round (it was
#     previously only reset on rounds that found no replies, so stale
#     indices got dropped again);
#   * the last remaining comment on a page is no longer silently lost;
#   * rectangles are only compared within the same page, never across
#     pages pooled together;
#   * reply/comment columns are addressed by name, not by position.

# Kept for downstream cells that may still reference them.
cmnt_list = []
reply_list = []
page_final = []
comment_final = []
Author_final = []
Creation_date_final = []

data2 = {'page': [],
         'loc_x0': [],
         'loc_y0': [],
         'loc_x1': [],
         'loc_y1': [],
         'comment': [],
         'reply': [],
         'Author': [],
         'Creation_date': []}


def _close(a, b, tol_pct=0.002):
    """True when b is within tol_pct percent of a (relative difference).

    Guards a == 0, which the previous inline expressions divided by.
    """
    if a == 0:
        return a == b
    return abs((a - b) / a * 100) < tol_pct


cmnt_reply = pd.DataFrame(data2)

for page_no in np.unique(cmnt_df['page']):
    page_df = cmnt_df[cmnt_df['page'] == page_no].reset_index(drop=True)
    # A page with a single annotation cannot contain a reply.
    if len(page_df) == 1:
        cmnt_reply = pd.concat([cmnt_reply, page_df])
        continue

    remaining = page_df.copy()
    reply_col = remaining.columns.get_loc('reply')
    comment_col = remaining.columns.get_loc('comment')
    while len(remaining) > 0:
        # The first remaining row is the next top-level comment.
        head = remaining.iloc[0]
        cmnt_reply = pd.concat([cmnt_reply, remaining.iloc[0:1]])
        consumed = [0]  # reset every round (previously leaked across rounds)
        for k in range(1, len(remaining)):
            row = remaining.iloc[k]
            matches = sum(
                _close(head[col], row[col])
                for col in ('loc_x0', 'loc_y0', 'loc_x1', 'loc_y1')
            )
            if matches >= 2:
                # Rectangle (almost) coincides with the comment's: this is a
                # reply — move its text from 'comment' into 'reply'.
                remaining.iat[k, reply_col] = row['comment']
                remaining.iat[k, comment_col] = None
                cmnt_reply = pd.concat([cmnt_reply, remaining.iloc[k:k + 1]])
                consumed.append(k)
        # Drop the comment and all replies just attributed to it, then
        # continue with the next top-level comment on this page.
        remaining = remaining.drop(remaining.index[consumed])