So, currently I have been trying to extract table of content, not just the table of content but extract it differentiating headers from sub-headers and nested sub-headers as well Table of content structure
I have been trying to make a script which makes a dictionary similar to this structure:
{
"headers": {
"Table of Contents": [
{
"name": "Introduction",
"subheaders": [
{
"name": "Permits for the Introduction of Certain Regulated Articles",
"subheaders": [
{
"name": "Regulated Articles Under 7 CFR part 340"
},
{
"name": "Exemptions Under § 340.2(b)"
}
]
},
{
"name": "Other Federal and State Regulations"
},
{
"name": "Containment Facilities"
},
{
"name": "Time Frame for Review and Issuance of a Permit"
},
{
"name": "NEPA Document (EA or EIS)"
}
]
},
{
"name": "Applying for a Permit",
"subheaders": [
{
"name": "Select the Type of Application and Submission Method",
"subheaders": [
{
"name": "Electronic Permit Application"
},
{
"name": "Paper Submission"
}
]
},
{
"name": "Set up an ePermits Account"
},
{
"name": "Helpful Tips for ePermits"
},
{
"name": "Create a Permit Application",
"subheaders": [
{
"name": "Permit or Notification"
},
{
"name": "Select Agent"
},
{
"name": "Submission Method"
},
{
"name": "Application Ownership - Responsible Person or Preparer"
},
{
"name": "Select New Permit, Amendment, Renewal",
"subheaders": [
{
"name": "Amendments"
},
{
"name": "Renewal"
}
]
},
{
"name": "Courtesy Permit"
},
{
"name": "Select the Introduction Type",
"subheaders": [
{
"name": "Number of Releases, Points of Origins, Destinations and Duration"
}
]
},
{
"name": "Select Confidential Business Information (CBI) or No CBI"
}
]
}
]
},
{
"name": "Application Submission - Data Requirements",
"subheaders": [
{
"name": "Provide a CBI Justification Statement"
},
{
"name": "Purpose of Permit",
"subheaders": [
{
"name": "Industrial Product"
},
{
"name": "Pharmaceutical Product"
},
{
"name": "Phytoremediation"
}
]
}
]
}
]
}
}
This is what I've come up with so far, still cant figure out the higher levels of nesting (sub-sub-sub-headers and deeper )
from docx import Document
def detect_headers_and_subheaders(doc):
headers = {}
current_headers = [headers] # Stack to track nested headers
current_levels = [0] # Stack to track current levels
for paragraph in doc.paragraphs:
if paragraph.style.name.startswith('Heading'):
header_text = paragraph.text.strip()
level = int(paragraph.style.name.split(' ')[-1])
if level == 1:
headers[header_text] = {}
current_headers = [headers]
current_levels = [1]
elif level > current_levels[-1]:
current_headers[-1][header_text] = {}
current_headers.append(current_headers[-1][header_text])
current_levels.append(level)
elif level == current_levels[-1]:
current_headers[-2][header_text] = {}
current_headers[-1][header_text] = {}
else:
while level <= current_levels[-1]:
current_headers.pop()
current_levels.pop()
current_headers[-1][header_text] = {}
current_headers.append(current_headers[-1][header_text])
current_levels.append(level)
return headers
def main():
doc = Document('doc_file.docx')
structured_content = detect_headers_and_subheaders(doc)
print(structured_content)
if __name__ == "__main__":
main()
I tried using adobe-extract api with python and parse the json but still facing the same problem.