Code to extract information from a PDF using excel

28 Views Asked by At

I tried to extract specific information from the first page of a pdf and open the folder. However, when i run the code nothing happens.

Sub OpenFolder()
    Dim MyFolder As String
    MyFolder = "C:\Users\mixna\Documents\autoFeedback Digital Platform\Group Assignment"
    ActiveWorkbook.FollowHyperlink MyFolder
End Sub

Sub for_each_file_in_folder()
    Dim oFSO As Object, oFolder As Object, oFile As Object
    Set oFSO = CreateObject("Scripting.FileSystemObject")
    Set oFolder = oFSO.GetFolder("C:\Users\mixna\Documents\autoFeedback Digital Platform\Group Assignment")
    
    For Each oFile In oFolder.Files
        If InStr(oFile.Name, ".pdf") > 0 Then
            Debug.Print oFile.Path
            Call read_pdf(oFile.Path)
        End If
    Next oFile
End Sub

Function read_pdf(in_file_path As String)
    Dim word_app As Object, word_doc As Object
    Dim all_text As String, lines As Variant, line As Variant
    Dim groupNumber As String
    Dim i As Integer, startIndex As Integer, endIndex As Integer
    
    Set word_app = CreateObject("Word.Application")
    word_app.Visible = False
    Set word_doc = word_app.Documents.Open(in_file_path, False, Format:=wdOpenFormatAuto, ConfirmConversions:=False)
    
    all_text = word_doc.Content.Text
    word_doc.Close False
    word_app.Quit
    Set word_doc = Nothing
    Set word_app = Nothing

    lines = Split(all_text, vbCrLf)
    
    ' Loop through all lines of text to extract the group number
    For i = LBound(lines) To UBound(lines)
        If InStr(lines(i), "IAR Group No.") > 0 Then
            groupNumber = Trim(Split(Split(lines(i), "IAR Group No.")(1), "_")(0))
            Exit For
        End If
    Next i

    ' Loop through all lines of text to identify the start and end of the table
    For i = LBound(lines) To UBound(lines)
        If InStr(lines(i), "First Name") > 0 Then
            startIndex = i
        ElseIf startIndex > 0 And Trim(lines(i)) = "" Then
            endIndex = i
            Exit For
        End If
    Next i

    ' Write extracted information to the Excel sheet
    Dim ws As Worksheet
    Set ws = ThisWorkbook.Sheets("Group List") ' Adjust the sheet name to match your workbook
    Dim lastRow As Long
    
    If startIndex > 0 And endIndex > 0 Then
        For j = startIndex + 1 To endIndex - 1 ' Loop through the table lines
            line = lines(j)
            Dim entryParts As Variant
            entryParts = Split(line, vbTab) ' Adjust the delimiter based on the actual text format
            If UBound(entryParts) >= 2 Then ' Ensure there are at least three columns
                lastRow = ws.Cells(ws.Rows.Count, "B").End(xlUp).Row + 1 ' Find the next empty row based on the 'First Name' column
                ws.Cells(lastRow, "A").Value = groupNumber
                ws.Cells(lastRow, "B").Value = entryParts(0) ' First Name
                ws.Cells(lastRow, "C").Value = entryParts(1) ' Surname
                ws.Cells(lastRow, "D").Value = entryParts(2) ' ID Number
                ' Marks column not handled here since there's no mark data in the PDF example provided
            End If
        Next j
    End If
End Function

This is my entire to open the folder and extract the pdf. The folder opens and word opens no information is saved into my excel sheet. There are no errors being shown.

0

There are 0 best solutions below