Python script that adds/replaces XML tags

60 Views Asked by At

I have this Python script that is supposed to find existing tags in an XML document and replace them with new, more descriptive tags. The problem is that after I run the script, it seems to only catch every few instances of the text string that I input. I am sure there is some reasoning behind why it's behaving this way, but I can't seem to figure it out.

import xml.etree.ElementTree as ET
from lxml import etree

def replace_specific_line_tags(input_file, output_file, replacements):
    """Rename <line> elements containing given text and save the result.

    The document in ``input_file`` is parsed with lxml. For every
    ``(target_text, replacement_tag)`` pair, each ``<line>`` element located
    anywhere below a ``<content>`` element whose string value contains
    ``target_text`` has its tag changed to ``replacement_tag``. The modified
    tree is then written to ``output_file`` as UTF-8 with an XML declaration.

    Args:
        input_file: Path (or file object) of the XML document to read.
        output_file: Path of the file that receives the updated XML.
        replacements: Iterable of ``(target_text, replacement_tag)`` pairs.
            Matching is a case-sensitive substring test (XPath ``contains``).
    """
    tree = etree.parse(input_file)
    root = tree.getroot()

    for target_text, replacement_tag in replacements:
        # `content//line` matches <line> at any depth below <content>; the
        # previous `content/page/line` silently skipped every line that was
        # not a direct child of a <page> element.
        # The text is passed as an XPath variable rather than formatted into
        # the expression, so quotes inside target_text cannot break the query.
        matching_lines = root.xpath(
            './/content//line[contains(., $needle)]',
            needle=target_text,
        )
        for line_tag in matching_lines:
            # Renaming in place keeps attributes, child elements, the text
            # and the tail (text/whitespace after the closing tag) intact;
            # building a new element and copying only `.text` lost the
            # children and the tail.
            line_tag.tag = replacement_tag

    # Binary mode: lxml emits already-encoded bytes when an encoding is given.
    with open(output_file, 'wb') as f:
        tree.write(f, encoding='utf-8', xml_declaration=True)

if __name__ == '__main__':
    # (target text, replacement tag) pairs; append further pairs as needed.
    replacements = [('The Washington Post', 'title')]

    # Read the untouched document and write the retagged copy next to it.
    replace_specific_line_tags(
        'beforeTagEdits.xml',
        'afterTagEdits.xml',
        replacements,
    )

The code runs, but it does not do everything I intended. I have tried changing some of the text strings to match known, exact strings in the original file, but that doesn't seem to fix the problem. Here is an example of the XML document as it currently stands:

<root>
     <content>
          <line>The Washington Post</line>
          <line>The Washington Post</line>
     </content>
</root>
1

There is 1 best solution below

0
Hermann12 On

You can walk the tree with iter() and rename the tag of every element whose text matches one of your search strings:

import xml.etree.ElementTree as ET

# Sample document: two <line> elements and one <tag> element to be renamed.
sample_document = """<root>
     <content>
          <line>The Washington Post</line>
          <line>The Washington Post</line>
          <tag>Der Spiegel</tag>
     </content>
</root>"""

root = ET.fromstring(sample_document)

# New tag name -> exact element texts that should receive that tag.
tags_to_texts = {'title': ['The Washington Post', 'Der Spiegel']}

# Flatten into text -> tag. If a text is listed under several tags the last
# one wins, which is the same outcome as applying the tags one after another.
tag_for_text = {
    text: new_tag
    for new_tag, texts in tags_to_texts.items()
    for text in texts
}

# Single pass over every element (the root included); elements without a
# matching text keep their tag.
for node in root.iter():
    new_tag = tag_for_text.get(node.text)
    if new_tag is not None:
        node.tag = new_tag

# Print the resulting XML to stdout.
ET.dump(root)

Output:

<root>
     <content>
          <title>The Washington Post</title>
          <title>The Washington Post</title>
          <title>Der Spiegel</title>
     </content>
</root>