Data extraction failed. Check XPath expressions

35 Views Asked by At

I am using below code to make entries of INDIAN passport we have in possession, here is the code, the xpath is correct but the file is printing error, Data extraction failed. Check XPath expressions.

also need to change html_content = "<html>...</html>" # Replace with actual HTML content

import webbrowser
import pandas as pd
from lxml import html

def get_user_input():
    return input("Enter the file number (or 'end' to exit): ")

def generate_url(file_number):
    base_url = "https://portal2.passportindia.gov.in/AppOnlineProject/statusTracker/trackStatusForFileNoNew?fileNo="
    return base_url + file_number

def scrape_data(html_content):
    tree = html.fromstring(html_content)

    # Example XPath expressions (replace with actual ones)
    extracted_file_number = tree.xpath("/html/body/table/tbody/tr/td/table/tbody/tr[1]/td/div/table/tbody/tr[6]/td/table/tbody/tr/td[2]/form/div/table/tbody/tr[1]/td/table/tbody/tr[1]/td[2]")
    extracted_dob = tree.xpath("/html/body/table/tbody/tr/td/table/tbody/tr[1]/td/div/table/tbody/tr[6]/td/table/tbody/tr/td[2]/form/div/table/tbody/tr[1]/td/table/tbody/tr[2]/td[2]")
    extracted_given_name = tree.xpath("/html/body/table/tbody/tr/td/table/tbody/tr[1]/td/div/table/tbody/tr[6]/td/table/tbody/tr/td[2]/form/div/table/tbody/tr[1]/td/table/tbody/tr[3]/td[2]")
    extracted_surname = tree.xpath("/html/body/table/tbody/tr/td/table/tbody/tr[1]/td/div/table/tbody/tr[6]/td/table/tbody/tr/td[2]/form/div/table/tbody/tr[1]/td/table/tbody/tr[4]/td[2]")
    extracted_received_date = tree.xpath("/html/body/table/tbody/tr/td/table/tbody/tr[1]/td/div/table/tbody/tr[6]/td/table/tbody/tr/td[2]/form/div/table/tbody/tr[1]/td/table/tbody/tr[6]/td[2]")

    # Check if any data was extracted
    if extracted_file_number:
        extracted_data = {
            "File Number": extracted_file_number[0],
            "Date of Birth": extracted_dob[0],
            "Given Name": extracted_given_name[0],
            "Surname": extracted_surname[0],
            "Application Received Date": extracted_received_date[0],
        }
        return extracted_data
    else:
        print("Data extraction failed. Check XPath expressions.")
        return None

def store_data(data):
    df = pd.DataFrame(data, index=[0])
    with pd.ExcelWriter("E:\\passport_data.xlsx", mode="a", engine="openpyxl") as writer:
        df.to_excel(writer, index=False, header=False)

def main():
    while True:
        file_number = get_user_input()
        if file_number.lower() == "end":
            print("Exiting the program.")
            break

        url = generate_url(file_number)
        webbrowser.open(url)

        # Placeholder for actual HTML content (retrieve from the opened URL)
        html_content = "<html>...</html>"  # Replace with actual HTML content
        extracted_data = scrape_data(html_content)
        if extracted_data:
            store_data(extracted_data)
            print("Data saved to passport_data.xlsx")
        else:
            print("Data extraction failed. Please check the website or try another file number.")

if __name__ == "__main__":
    main()

I want the python file to create excel data when entering file number of the passport, it will go to the passport seva link and extract data from the site, after that it will store the same in excel file in table format.

0

There are 0 best solutions below