I am trying to convert some HTML into reportlab syntax so that I can generate a PDF. I have the following code which handles the conversion, but I when giving it nested lists, the item right before the nested list starts gets ommited in the final PDF. I have been trying for hours to fix this and I can't seem to understand why it happens.
import xml.sax as sax
from reportlab.platypus import Paragraph, PageBreak, Table, Preformatted, Image, Spacer
from reportlab.lib.pagesizes import A4
from bs4 import BeautifulSoup
import markdown2
import requests
from io import BytesIO
from PIL import Image as PILImage
import copy
def getColumnWidths(pageSize, pageMargin, columnCount):
"""Calculate the width of each column based on the page margin and number of columns"""
available_width = pageSize[0] - (pageMargin + pageMargin)
column_widths = [available_width / columnCount] * columnCount
return column_widths
def html_to_rl(html, styleSheet, tableStyle, requestHeaders):
converter = markdown2.Markdown(extras=["tables", "fenced-code-blocks"])
html = BeautifulSoup(converter.convert(html), "html.parser").prettify()
print(html)
elements = list()
class Handler(sax.ContentHandler):
mode = ""
buffer = ""
listcounter = 0
nesting_level = 0
listtype = ""
in_link = False
elements = []
list_stack = []
in_pre = False # Flag to indicate if inside a <pre> block (used for code blocks)
in_code = False
link_href = ""
current_table_data = [] # Holds data for the current table being processed
in_table = False # Flag to indicate if inside a <table>
def download_image(self, url, headers):
"""Download an image from the given URL and return the image data."""
response = requests.get(url, headers=headers, stream=True)
if response.status_code == 200:
image_data = BytesIO(response.content)
return image_data
else:
raise Exception("Failed to download image")
def calculate_max_dimensions(self, page_size=A4, left_margin=40, right_margin=40, top_margin=80, bottom_margin=80):
"""Calculate the maximum dimensions available for the image."""
max_width = page_size[0] - (left_margin + right_margin)
max_height = page_size[1] - (top_margin + bottom_margin)
return max_width, max_height
def adjust_image_size(self, imageData, max_width, max_height):
"""
Adjust the image size to fit within max_width and max_height while maintaining aspect ratio.
This will add overhead but without this ReportLab will complain about image being too big to fit within the constraints
"""
with PILImage.open(imageData) as img:
original_width, original_height = img.size
ratio = min(max_width / original_width, max_height / original_height)
return int(original_width * ratio), int(original_height * ratio)
def startElement(self, name, attrs):
if name in ["strong", "em", "i", "b"]:
self.mode = name
elif name == "a":
self.in_link = True
self.link_href = attrs.get('href', '')
self.buffer += f'<link href="{self.link_href}">'
elif name in ["ol", "ul"]:
# New list context
list_type = name
list_counter = 0 # Initialize counter for new list
self.list_stack.append({"type": list_type, "counter": list_counter})
elif name == "li":
# Handle list item based on current list context
print(self.buffer)
list_context = self.list_stack[-1] if self.list_stack else None
if list_context:
list_type = list_context["type"]
list_counter = list_context["counter"] + 1 # Increment counter
self.buffer = f"{list_counter}. " if list_type == "ol" else "• " # Set prefix
print(self.buffer)
list_context["counter"] = list_counter # Update counter in stack
elif name == "hr":
elements.append(PageBreak())
elif name == "pre":
self.in_pre = True
self.buffer = "" # Clear le buffer
elif name == "img":
src = attrs.get('src', '') # Get image URL
imageData = self.download_image(src, requestHeaders)
imageWidth, imageHeight = self.adjust_image_size(imageData, *self.calculate_max_dimensions())
elements.append(Image(imageData, width=imageWidth, height=imageHeight))
if name == "table":
self.in_table = True
self.current_table_data = []
elif name == "tr" and self.in_table:
self.current_table_data.append([])
elif name in ["td", "th"] and self.in_table:
self.buffer = ""
if name == "code" and not self.in_pre:
self.in_code = True
self.buffer += "<font name='Courier' size='11' color='#e1665d'>" # Add opening <code> tag to buffer
def endElement(self, name):
if name == "a":
self.buffer += '</link>'
self.in_link = False
self.link_href = ""
elif name.startswith("h") and name[-1] in ["1", "2", "3", "4", "5", "6"]:
elements.append(Paragraph(self.buffer, styleSheet["Heading%s" % name[-1]]))
self.buffer = ""
elif name in ["strong", "em", "i", "b"]:
self.mode = ""
elif name == "p":
elements.append(Paragraph(self.buffer, styleSheet["BodyText"]))
self.buffer = ""
elif name in ["ol", "ul"]:
# End of list context
self.list_stack.pop()
elif name == "li":
# Process list item content
if len(self.list_stack) > 0: # Check if nested
list_style = copy.deepcopy(styleSheet["BodyText"])
indentation = 24 * (len(self.list_stack) - 1)
list_style.leftIndent = indentation + 24
if self.buffer.strip() == "":
self.buffer = "" # Remove empty text
elements.append(Paragraph(self.buffer, list_style))
self.buffer = ""
else: # Outermost list item
list_style = copy.deepcopy(styleSheet["BodyText"])
elements.append(Paragraph(self.buffer, list_style))
self.buffer = ""
elif name == "pre":
self.in_pre = False
elements.append(Spacer(1, 10))
elements.append(Preformatted(self.buffer, styleSheet["Code"], maxLineLength=80, newLineChars=''))
elements.append(Spacer(1, 10))
self.buffer = ""
if name == "code" and not self.in_pre:
self.buffer += "</font>" # End inline code styling
self.in_code = False
if name in ["td", "th"] and self.in_table:
self.current_table_data[-1].append(self.buffer)
self.buffer = "" # Reset buffer after capturing cell content
# Handle table end, create and append ReportLab Table element
if name == "table" and self.in_table:
# Convert non-header cell contents to Paragraphs for better formatting
print(self.current_table_data)
table_data_formatted = [
[cell if idx == 0 else Paragraph(cell, styleSheet["Normal"]) for cell in row]
for idx, row in enumerate(self.current_table_data)
]
# Assume table column widths are evenly distributed; adjust as needed
table_column_widths = getColumnWidths(A4, 40, len(self.current_table_data[0]))
reportlab_table = Table(table_data_formatted, colWidths=table_column_widths)
reportlab_table.setStyle(tableStyle)
elements.append(reportlab_table)
elements.append(Spacer(1, 10))
self.in_table = False # Reset table handling flags
def characters(self, chars):
if self.in_code:
chars = chars.replace("&", "&") # Must be done first!
chars = chars.replace("<", "<")
chars = chars.replace(">", ">")
chars = chars.replace('"', """)
chars = chars.replace("'", "'")
chars = chars.rstrip()
if self.in_table:
chars = chars.strip() # Remove newlines / trailing spaces from table cells, list items
if self.mode in ["strong", "em", "i", "b"]:
chars = f"<{self.mode}>{chars}</{self.mode}>"
self.buffer += chars
# Parse the HTML
sax.parseString(u"<doc>%s</doc>" % html, Handler())
return elements
This a snippet of my HTML source where I have an ordered list:
<ol>
<li>
Step 1
</li>
<li>
Step 2
<ol>
<li>
Multi level step 1
</li>
<li>
Multi level step 2
</li>
</ol>
</li>
<li>
Step 3
</li>
<li>
Step 4
<ol>
<li>
Multi level step 1
</li>
</ol>
</li>
<li>
Finish
</li>
</ol>
In the output, I am missing steps 2 and 4, as seen below:
1. Step 1
1. Multi level step 1
2. Multi level step 2
3. Step 3
1. Multi level step 1
5. Finish
