Is it possible to read a table from a PDF below a specific piece of text?

46 Views Asked by At

I have a PDF with many tables. I need to extract the tables that appear below a specific heading and save them as a DataFrame. The screenshots below are for your reference.

input pdf file reference

output needed

Output I am getting

I tried using pdfplumber:

def extract_origin_from_start_section(start_section):
    """Return the origin name embedded in a section header.

    The origin is the text between the first ':' and the next ',' (or the
    next ':' if one comes first), with surrounding whitespace removed; for
    example ``'ORIGIN : COCHIN, INDIA(CY)'`` gives ``'COCHIN'``.

    Raises:
        IndexError: if ``start_section`` contains no ':'.
    """
    colon_fields = start_section.split(':')
    after_colon = colon_fields[1]
    # Only the first comma-separated field is the origin name.
    origin_field, *_ = after_colon.split(',')
    return origin_field.strip()
def extract_tables_between_sections(pdf_file, start_section, end_section):
    """Collect the text rows found between two section headers of a PDF.

    Pages are scanned in order. Collection starts right after the first
    occurrence of ``start_section`` and stops right before the next
    occurrence of ``end_section``; a section may span several pages. Each
    page's rows become one DataFrame tagged with an ``ORIGIN`` column, and
    the per-page frames are concatenated.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.
        start_section: Header text that opens the region to extract. The
            ORIGIN value is derived from it.
        end_section: Header text that closes the region.

    Returns:
        A single ``pandas.DataFrame`` with positional columns plus
        ``ORIGIN``, or ``None`` when no rows were found.
    """
    origin = extract_origin_from_start_section(start_section)
    tables = []
    is_between_sections = False

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Fall back to '' so a page that yields no text (e.g. a scanned
            # image) cannot break the substring searches below.
            text = page.extract_text() or ''

            start_idx = text.find(start_section)
            if start_idx != -1:
                is_between_sections = True
                text = text[start_idx + len(start_section):]

            if not is_between_sections:
                continue

            end_idx = text.find(end_section)
            if end_idx != -1:
                # Keep only what precedes the closing header; later pages
                # are ignored unless start_section shows up again.
                text = text[:end_idx]
                is_between_sections = False

            table_data = _parse_table_rows(text)
            if not table_data:
                # Nothing usable on this page; skip it instead of
                # concatenating an empty frame.
                continue

            table_df = pd.DataFrame(table_data)
            table_df['ORIGIN'] = origin
            tables.append(table_df)

    if not tables:
        return None
    return pd.concat(tables, ignore_index=True)


def _parse_table_rows(text):
    """Split a block of page text into rows of cell strings."""
    rows = []
    for line in text.strip().split('\n'):
        stripped = line.strip()
        # Skip blank lines and the 'Direct' / 'Call' lines.
        if not stripped or stripped.startswith(('Direct', 'Call')):
            continue
        parts = stripped.split(',')
        if len(parts) > 1:
            # Text before the first comma is one cell; the field after it is
            # split on whitespace.
            # NOTE(review): anything after a second comma is dropped, as in
            # the original code -- confirm this matches the PDF layout.
            rows.append([parts[0].strip()] + parts[1].split())
        else:
            rows.append(stripped.split())
    return rows


# Header text that opens the section to extract; the ORIGIN column value is
# derived from this string.
start_section = 'ORIGIN : COCHIN, INDIA(CY)'
# Header text at which extraction stops.
end_section = 'ORIGIN : COLOMBO, SRI LANKA(CY)'

# Parse 'temp.pdf' and collect the rows found between the two headers.
tables_between_sections = extract_tables_between_sections('temp.pdf', start_section, end_section)

# Bare expression: presumably relies on a notebook/REPL to echo the result;
# in a plain script this line has no visible effect.
tables_between_sections
0

There are 0 best solutions below