I need help with a LLM project that I am working on. The LLM generates a feedback response and my task is to format the output to JSON. I have written the basic structure of the code that uses a regular expression to fetch heading(issue), node_ids, detailed_description from the textual data to output into a formatted JSON. While this python code extracts the relevant information, the formatting is not right. Can someone help me resolve this issue?
import re
import os
import json
from nltk.tokenize import sent_tokenize
def extract_data(text):
section_pattern = r'\d+\.\s\*(.*?)\((Node ID:.*?)\).*?((?=\d+\.\s*\*)|$)|\-\s(.*?)\n\n'
section_regex = re.compile(section_pattern, re.MULTILINE | re.DOTALL)
matches = section_regex.findall(text)
data = []
# print(matches)
for match in matches:
heading = match[0].strip()
node_ids = re.findall(r'\d+:\d+', match[1])
detailed_desc = extract_detailed_desc(match[3].strip())
data.append({
'issue': heading,
'node_ids': node_ids,
'detailed_feedback': detailed_desc
})
return data
def extract_detailed_desc(text):
sentences = sent_tokenize(text)
detailed_desc = []
for sentence in sentences:
detailed_desc.append(sentence.strip('-').strip())
return detailed_desc
def main():
txt_file_path = "./FormatOutput/sample.txt"
if os.path.exists(txt_file_path):
try:
with open(txt_file_path, 'r') as txt_file:
data = txt_file.read()
# print(data)
except Exception as e:
print("Error occurred while reading the text file:", e)
else:
print("File not found:", txt_file_path)
structured_data = extract_data(data)
json_data = json.dumps(structured_data, indent=4)
print(json_data)
if __name__ == "__main__":
main()
The desired output should look something like this:
[
{
"issue": "Content Clarity and Structure",
"node_ids": ["117:55", "117:135"],
"detailed_feedback": [
"Combine the text into a clear paragraph explaining the service's purpose, benefits, functionality, and value proposition.",
"Refine the mission statement to directly address customer pain points the service solves."
]
},
{
"issue": "Call to Action (CTA) Optimization",
"node_ids": ["117:38", "117:89"],
"detailed_feedback": [
"Revise the 'BUY NOW' CTA to include the service name or offer (e.g., 'Get [Service Name] Now').",
"Modify CTAs to create urgency: 'Start Free Trial Today' & 'Secure Your System Now'.",
"Streamline CTAs by combining similar actions and differentiating trial and purchase options.",
"Adjust 'DOWNLOAD FREE' button color for better visibility (lighter shade or contrasting color)."
]
},
{
"issue": "Headline and Introduction Enhancement",
"node_ids": ["117:27"],
"detailed_feedback": [
"Increase headline text size and weight for prominence on the product image.",
"Use bullet points with larger font size or contrasting color to highlight features."
]
}
]
I changed the structure of the code. Now it is giving output in right formatting.