I am parsing an HTML document using the lxml.etree library. I am facing a weird issue: when I parse the exact same document and retrieve the content using a different XPath expression, the encoding of the retrieved text is different.
Here is a reproducible example:
from lxml import etree
from io import StringIO
def parse_html(html_, xpath):
    """Parse an HTML string and serialize whatever an XPath query selects.

    Args:
        html_: HTML markup as a text string; it is wrapped in a ``StringIO``
            and fed to ``etree.parse`` with a fresh ``etree.HTMLParser``.
        xpath: XPath expression evaluated against the parsed tree.

    Returns:
        A list with one entry per item returned by the XPath query, each
        produced by ``etree.tostring(item, encoding='unicode')``, in the
        order the query returned them.
    """
    tree = etree.parse(StringIO(html_), etree.HTMLParser())

    serialized = []
    for node in tree.xpath(xpath):
        # encoding='unicode' asks lxml for text output rather than bytes.
        serialized.append(etree.tostring(node, encoding='unicode'))
    return serialized
# Sample markup: an outer "positions-container" div wrapping one
# "item-container" div whose class attribute contains non-ASCII characters.
listing_page = """<div class="positions-container" style="position:_|_relative;_|_height:_|_4309.41px;" a="a"> <div class="item-container_|_software_development_and_architecture_|_remote_work_|_helsinki_|_jyväskylä_|_lahti_|_oulu_|_rauma_|_tampere_|_turku_|_vaasa" style="position:_|_absolute;_|_left:_|_0px;_|_top:_|_0px;" a="a"> </div> </div>"""

# Query 1 selects every element in the parsed document.
xpath_1 = "//*"
# Query 2 selects only the item div nested inside the positions container.
xpath_2 = '//div[@class="positions-container"]//div[contains(@class,"item-container")]'

# Only the first serialized match of each query is inspected; the slice
# bounds differ so that both excerpts cover the same part of the class value.
matches_1 = parse_html(listing_page, xpath_1)
result_1 = matches_1[0]
excerpt_1 = result_1[190:220]
print("Parsing using xpath_1:", f"...{excerpt_1}...")

matches_2 = parse_html(listing_page, xpath_2)
result_2 = matches_2[0]
excerpt_2 = result_2[85:125]
print("Parsing using xpath_2:", f"...{excerpt_2}...")
Outputs:
Parsing using xpath_1: ...lsinki_|_jyväskylä_|_lahti_|_o...
Parsing using xpath_2: ...lsinki_|_jyväskylä_|_lahti_|_o...