I have a PDF with many tables, and I need to extract the tables and save them as a DataFrame. Below is a screenshot for your reference.
I tried using pdfplumber:
def extract_origin_from_start_section(start_section: str) -> str:
    """Return the origin name embedded in a section header.

    The origin is the text between the first ':' and the next ',' with
    surrounding whitespace removed, e.g. ``'ORIGIN : COCHIN, INDIA(CY)'``
    yields ``'COCHIN'``.

    Args:
        start_section: Section header text containing a ':' separator.

    Returns:
        The stripped text found after the first ':' and before the
        following ','.

    Raises:
        IndexError: If ``start_section`` contains no ':'.
    """
    after_colon = start_section.split(':')[1]
    origin = after_colon.split(',')[0].strip()
    return origin
def _parse_section_rows(text):
    """Split section text into rows of cell values.

    Blank lines and lines starting with 'Direct' or 'Call' are skipped.
    A line containing a comma becomes ``[text before the first comma]``
    followed by the whitespace-separated tokens of the segment after it;
    any other line is split on whitespace.
    """
    rows = []
    for line in text.strip().split('\n'):
        stripped = line.strip()
        if not stripped or stripped.startswith(('Direct', 'Call')):
            continue
        parts = line.split(',')
        if len(parts) > 1:
            # NOTE(review): only the segment between the first and second
            # comma is kept; anything after a second comma is dropped.
            # Confirm this matches the table layout in the PDF.
            rows.append([parts[0]] + parts[1].split())
        else:
            rows.append(parts[0].split())
    return rows


def extract_tables_between_sections(pdf_file, start_section, end_section):
    """Collect the rows found between two section headers of a PDF.

    Pages are scanned in order. Collection starts right after
    ``start_section`` and stops just before ``end_section``; pages that lie
    entirely between the two headers are parsed in full.

    Args:
        pdf_file: Path (or file object) passed to ``pdfplumber.open``.
        start_section: Header text marking where extraction begins. The
            origin name is derived from it via
            ``extract_origin_from_start_section``.
        end_section: Header text marking where extraction ends.

    Returns:
        A DataFrame with one row per parsed line (positional integer
        columns plus an ``'ORIGIN'`` column), or ``None`` when no rows were
        found between the two headers.
    """
    tables = []
    is_between_sections = False
    origin = extract_origin_from_start_section(start_section)

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against a None/empty result (e.g. a page with no text
            # layer) so the substring searches below cannot raise.
            text = page.extract_text() or ''

            start_idx = text.find(start_section)
            if start_idx != -1:
                is_between_sections = True
                text = text[start_idx + len(start_section):]

            if not is_between_sections:
                continue

            end_idx = text.find(end_section)
            if end_idx != -1:
                # Keep only the text before the closing header; later pages
                # are ignored unless the start header appears again.
                text = text[:end_idx]
                is_between_sections = False

            table_data = _parse_section_rows(text)
            if not table_data:
                # Nothing usable on this page; skip it so empty frames are
                # not fed into the final concat.
                continue

            table_df = pd.DataFrame(table_data)
            table_df['ORIGIN'] = origin
            tables.append(table_df)

    if not tables:
        return None
    return pd.concat(tables, ignore_index=True)
# Section headers delimiting the region of the PDF to extract: the first marks
# where extraction starts, the second where it stops.
start_section = 'ORIGIN : COCHIN, INDIA(CY)'
end_section = 'ORIGIN : COLOMBO, SRI LANKA(CY)'
# Run the extraction against the local file 'temp.pdf'.
tables_between_sections = extract_tables_between_sections('temp.pdf', start_section, end_section)
# Bare expression: presumably here so a notebook/REPL cell displays the result;
# it has no effect when run as a plain script.
tables_between_sections