Python - electoral data extraction and regex

56 Views Asked by At

I'm trying to analyse the voter data for Nagaland using Python. I've downloaded the data from https://old.eci.gov.in/electoral-roll/link-to-pdf-e-roll/. The data is in PDF format and I need to convert it to Excel for my analysis. Here is my code.

Note: I'm using PyPDF2 version 2.12.1 (PyPDF2==2.12.1).

import os
import PyPDF2
import re
import pandas as pd
import numpy as np

# Folder holding the downloaded electoral-roll PDFs. A raw string keeps the
# Windows backslashes literal ('\P' and '\V' are not valid escape sequences).
directory = r'E:\Python\Voter'

# List the folder first so a missing directory fails before the scratch file
# is created. The extension check is case-insensitive so '.PDF' files count.
pdf_names = [filename for filename in os.listdir(directory)
             if filename.lower().endswith(".pdf")]

# Dump the extracted text of every selected page into one scratch file.
# The file is opened once for the whole run, in 'w' mode, so leftovers from
# an earlier aborted run are not parsed a second time. UTF-8 avoids
# UnicodeEncodeError on platforms whose default encoding is narrower.
with open('pdf_content.txt', 'w', encoding='utf-8') as dest_file:
    for filename in pdf_names:
        with open(os.path.join(directory, filename), 'rb') as src_file:
            # PdfReader / .pages / extract_text() are the non-deprecated
            # spellings of PdfFileReader / getPage / extractText.
            pdfreader = PyPDF2.PdfReader(src_file)
            num_pg = len(pdfreader.pages)

            # Page indices 0, 1 and the final index are skipped.
            # NOTE(review): presumably cover/summary pages - confirm.
            start_pno = 2
            end_pno = num_pg - 1

            for pg in range(start_pno, end_pno):
                dest_file.write(pdfreader.pages[pg].extract_text())

# Rewrite the extracted text so that every chunk delimited by ' No' lands on
# its own line of 'pdf_line_content.txt'. Both handles are managed by `with`
# so they are closed even if reading, splitting or writing fails.
#
# NOTE(review): the input is read as bytes and passed through str(), which
# yields the bytes *repr*: the text starts with "b'" and every newline in the
# PDF text appears as a literal backslash followed by 'n' rather than as
# whitespace. That keeps each chunk on a single physical line, but it also
# means '\s' in later regexes cannot match those former newlines. This is
# left unchanged here because the parsing patterns may depend on it; confirm
# against real extractText() output before switching to a decoded read.
with open('pdf_line_content.txt', 'w') as out_file, \
        open('pdf_content.txt', 'rb') as new_file:
    s = new_file.read()
    strn = re.split(' No', str(s))
    out_file.write('\n'.join(strn))

# Field patterns, compiled once instead of being looked up for every line.
# The pattern text is unchanged. Each has a single capture group, so
# findall() returns a (possibly empty) list of captured strings.
h_no_re = re.compile(r'\s:\s(.*?)Gender')
gender_re = re.compile(r'.Gender\s:\s(.*?)Age')
name_re = re.compile(r'.\dName\s:\s(.*?)\s')
age_re = re.compile(r'[A-Z]*\s\s(\d\d)\s.\w')
f_name_re = re.compile(r'\sName\s:\s(.*?)\s\s\d\d')

# One tuple per input line, ordered as
# (house no, gender, name, age, father's/husband's name).
row = []

# Iterate the file lazily; the handle is closed on exit from the block, which
# must happen before the scratch files are deleted below.
with open('pdf_line_content.txt', 'r') as out_fl:
    for eachline in out_fl:
        row.append((
            h_no_re.findall(eachline),
            gender_re.findall(eachline),
            name_re.findall(eachline),
            age_re.findall(eachline),
            f_name_re.findall(eachline),
        ))

# The intermediate text files are only scratch space for this run.
os.remove('pdf_content.txt')
os.remove('pdf_line_content.txt')

# Build the table from the parsed tuples; every cell initially holds the list
# of strings returned by re.findall for that field.
df = pd.DataFrame(row, columns = ['House No', 'Gender', 'Name', 'Age',
                                  'Father\'s/Husband\'s Name'])

# Collapse each list of matches into one string ('' when nothing matched).
for colmn in df.columns:
    df[colmn] = df[colmn].apply(''.join)

# Treat "no match" as missing and drop lines where no field matched at all.
df.replace('', np.nan, inplace=True)
df.dropna(how = 'all', inplace=True)

# The context manager saves and closes the workbook even if to_excel raises.
# sheet_name is passed by keyword: positional use is deprecated in pandas 2.x.
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Content')

pageob.extractText() -- Output as below. enter image description here

However, my final output is as below:

enter image description here

It looks like I'm missing something in the regexes or in how I map their results to the columns. I want my output to contain the data instead of NaN. Could someone help me fix this? Any input is appreciated.

0

There are 0 best solutions below