How to generate a PST file from folders and EML files with Python

45 Views Asked by At

I currently have a script that backs up an email account over an IMAP connection. It downloads every email in EML format, recreates the mailbox's subfolders locally, and saves each email in its corresponding subfolder. I'm looking for a way to generate a PST file from the downloaded folders and files. I tried automating Outlook through win32com, but ran into difficulties in the process. Does anyone know a library or a program that can do this?

Code for downloading emails:

import imaplib
import email
from email.header import decode_header
import os
import re

# Mail account settings
email_address = '[email protected]'
password = 'pass'
server = 'server.serveremail.com'

# Root directory of the backup; every sub-mailbox becomes a sub-directory
backup_root_folder = 'backup_emails'
backup_folder = ''

# Group 2 captures the mailbox path below "INBOX." in a LIST response line and
# is empty for the INBOX itself. Compiled once instead of once per folder.
folder_name_regex = re.compile(r'\".\" \"?INBOX(\.([^\"]+))?(\"|\')?')
# Characters that must not appear in backup file or directory names
forbidden_chars_regex = re.compile(r'[<>:"/\\|?*]')
# Control characters (tab, the CR/LF of folded headers, ...) and U+F0FC
control_chars_regex = re.compile(r'[\x00-\x1f\x7f\uf0fc]')
# Upper bound for the subject part of a file name, so that a very long
# subject cannot push the file name past the file system's length limit
max_subject_length = 120


def _decode_subject(raw_subject):
    """Return the complete, decoded Subject header as text.

    Falls back to 'Sin asunto' when the message has no subject. Every
    fragment returned by ``decode_header`` is decoded (not only the first
    one), and an unknown charset falls back to UTF-8 with replacement
    characters instead of aborting the whole backup.
    """
    if not raw_subject:
        return 'Sin asunto'
    fragments = []
    for value, charset in decode_header(raw_subject):
        if isinstance(value, bytes):
            try:
                value = value.decode(charset or 'utf-8', errors='replace')
            except LookupError:
                # Charset name Python does not know, e.g. "unknown-8bit"
                value = value.decode('utf-8', errors='replace')
        fragments.append(value)
    return ''.join(fragments)


def _build_file_name(subject, uid):
    """Return a file name of the form ``<clean subject>_<uid>.eml``.

    The UID suffix keeps messages that share a subject from overwriting
    each other inside the same folder.
    """
    cleaned_subject = forbidden_chars_regex.sub('', subject)
    cleaned_subject = control_chars_regex.sub('', cleaned_subject)
    cleaned_subject = cleaned_subject[:max_subject_length] or 'Sin asunto'
    return f'{cleaned_subject}_{uid}.eml'


def _backup_selected_folder(connection, imap_folder, target_folder):
    """Save every message of the currently selected mailbox as an .eml file.

    connection    -- logged-in IMAP connection with ``imap_folder`` selected
    imap_folder   -- mailbox name, used only in the progress messages
    target_folder -- existing local directory that receives the files
    """
    try:
        status, messages = connection.uid('search', None, 'ALL')
    except (imaplib.IMAP4.error, OSError) as e:
        print(f"Error al buscar mensajes en la carpeta {imap_folder}: {e}")
        return
    if status != 'OK':
        print(f"No se pudo buscar mensajes en la carpeta {imap_folder}. Estado: {status}")
        return
    print('si se obtuvieron los mensajes')
    print('*********************************************************************************')

    # An empty mailbox yields [b''] (or [None]); both mean "nothing to do"
    uids = messages[0].split() if messages and messages[0] else []
    for msg_id in uids:
        uid = msg_id.decode('ascii')
        status, msg_data = connection.uid('fetch', msg_id, '(RFC822)')
        # A successful fetch carries a (envelope, payload) tuple first;
        # anything else means the message could not be retrieved
        if status != 'OK' or not msg_data or not isinstance(msg_data[0], tuple):
            print(f"No se pudo descargar el mensaje {uid} de la carpeta {imap_folder}. Estado: {status}")
            continue
        raw_message = msg_data[0][1]

        # The subject is only needed to build a readable file name
        msg = email.message_from_bytes(raw_message)
        subject = _decode_subject(msg.get('Subject'))
        backup_file_path = os.path.join(target_folder, _build_file_name(subject, uid))
        print("Guardando mensaje como:", backup_file_path)

        # Written in binary mode so the message is stored exactly as received
        with open(backup_file_path, 'wb') as backup_file:
            backup_file.write(raw_message)


# Connect to the IMAP server
mail = imaplib.IMAP4_SSL(server)
try:
    mail.login(email_address, password)

    # Fetch the list of all mailboxes
    status, folders = mail.list()
    os.makedirs(backup_root_folder, exist_ok=True)

    # Walk every mailbox and back up its messages
    for folder_info in folders or []:
        print(folder_info)
        # Decode the raw LIST line; str(bytes) would leave the b'...' wrapper
        # (and a trailing quote) inside unquoted mailbox names
        if isinstance(folder_info, bytes):
            folder_info_decoded = folder_info.decode('utf-8', errors='replace')
        else:
            folder_info_decoded = str(folder_info)

        match = folder_name_regex.search(folder_info_decoded)
        if not match:
            print(f"No se pudo obtener el nombre de la carpeta: {folder_info_decoded}")
            continue

        sub_path = match.group(2)
        folder_name = sub_path if sub_path else "INBOX"
        # Skip mailboxes whose name cannot be used as a directory name
        if forbidden_chars_regex.search(folder_name):
            print(f"La carpeta '{folder_name}' contiene caracteres especiales y será omitida.")
            continue
        print("************* SIPASO***** " + folder_name)

        if sub_path:
            # The server-side name keeps the "INBOX." prefix and "." levels;
            # only the local directory uses the OS path separator
            imap_folder = "INBOX." + sub_path
            local_name = sub_path.replace(".", os.path.sep).replace("'", "")
            backup_folder = os.path.join(backup_root_folder, local_name)
            print(backup_folder)
            os.makedirs(backup_folder, exist_ok=True)
        else:
            imap_folder = "INBOX"
            backup_folder = backup_root_folder

        # Select the mailbox by the name the server reported. The name is
        # quoted so that spaces survive, and the mailbox is opened read-only
        # so that downloading the backup does not mark messages as read.
        try:
            status, _ = mail.select(f'"{imap_folder}"', readonly=True)
        except (imaplib.IMAP4.error, OSError, UnicodeError) as e:
            print(f"Error al seleccionar la carpeta {imap_folder}: {e}")
            continue
        if status != 'OK':
            print(f"No se pudo seleccionar la carpeta {imap_folder}. Estado: {status}")
            continue
        print(status+" si seleccionamos----------------****>>>"+imap_folder)

        _backup_selected_folder(mail, imap_folder, backup_folder)
finally:
    # Close the connection even when a download fails midway
    mail.logout()

print("Backup completado. Mensajes guardados en las carpetas de respaldo.")

Failed attempt to create PST file:

import os
import win32com.client

def create_pst_from_eml(source_folder, pst_file_path):
    """Walk *source_folder* and try to add every ``.eml`` file to a PST store.

    source_folder -- directory tree that contains the downloaded .eml files
    pst_file_path -- path of the PST data file to fill

    The PST is attached to the Outlook profile when it is not attached yet
    and is detached again once the walk finishes. Errors raised while
    importing are printed, not propagated.
    """
    # Imported locally: the body used these names without any import
    from email import policy
    from email.parser import BytesParser

    Outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

    # Attach the store whenever it is not part of the profile yet, not only
    # when the file is missing: an existing but detached PST would otherwise
    # never be found below.
    PSTFolderObj = find_pst_folder(Outlook, pst_file_path)
    if PSTFolderObj is None:
        Outlook.AddStore(pst_file_path)
        PSTFolderObj = find_pst_folder(Outlook, pst_file_path)
    if PSTFolderObj is None:
        # Nothing to import into and nothing to detach
        print(f"PST store not found after AddStore: {pst_file_path}")
        return

    try:
        for root, _, files in os.walk(source_folder):
            for file_name in files:
                if not file_name.endswith('.eml'):
                    continue
                eml_file_path = os.path.join(root, file_name)
                with open(eml_file_path, 'rb') as eml_file:
                    eml_data = eml_file.read()
                msg = BytesParser(policy=policy.default).parsebytes(eml_data)
                # NOTE(review): Outlook's Items.Add is documented to take an
                # item type / message class, so passing a parsed Python
                # message presumably fails here -- confirm, and replace with
                # a real EML import path before relying on this function.
                PSTFolderObj.Items.Add(msg)
    except Exception as exc:
        # Top-level boundary of the script: report the failure, still detach
        print(exc)
    finally:
        Outlook.RemoveStore(PSTFolderObj)

def find_pst_folder(OutlookObj, pst_filepath):
    """Return the root folder of the data-file store located at *pst_filepath*.

    OutlookObj   -- object exposing a ``Stores`` collection
    pst_filepath -- path of the PST file to look for

    Paths are compared case-insensitively and after normalisation, so a
    doubled separator or different letter case in either path does not hide
    an attached store. Returns ``None`` when no data-file store matches.
    """
    # Windows path rules are applied explicitly, whatever os.path is in use
    import ntpath

    def _canonical(path):
        return ntpath.normcase(ntpath.normpath(path))

    wanted = _canonical(pst_filepath)
    for Store in OutlookObj.Stores:
        if not Store.IsDataFileStore:
            continue
        store_path = Store.FilePath
        # Skip stores that report no file path at all
        if store_path and _canonical(store_path) == wanted:
            return Store.GetRootFolder()
    return None

def enumerate_folders(FolderObj):
    """Print the messages of every folder in the tree rooted at FolderObj.

    Sub-folders are processed recursively before the messages of FolderObj
    itself are handed to ``iterate_messages``.
    """
    for nested_folder in FolderObj.Folders:
        enumerate_folders(nested_folder)

    iterate_messages(FolderObj)

def iterate_messages(FolderObj):
    """Print the header fields and attachment names of each item in FolderObj.

    For every item a separator line is printed, followed by sender name,
    sender address, sent date, To, CC, BCC and subject, then one line per
    attachment file name.
    """
    header_fields = (
        "SenderName",
        "SenderEmailAddress",
        "SentOn",
        "To",
        "CC",
        "BCC",
        "Subject",
    )
    for message in FolderObj.Items:
        print("***************************************")
        for field in header_fields:
            print(getattr(message, field))

        # Attachments.Item is addressed with 1-based positions
        total = message.Attachments.Count
        for position in range(1, total + 1):
            print(message.Attachments.Item(position).Filename)

if __name__ == "__main__":
    # Directory tree holding the downloaded .eml files, and the PST file
    # that is to be built from it.
    source_folder = r"C:\Users\\Desktop\respaldo\backup_emails"
    pst_file_path = r"C:\Users\\Desktop\respaldo\backup_emails.pst"
    create_pst_from_eml(
        source_folder=source_folder,
        pst_file_path=pst_file_path,
    )

Additional comments: The first script works on Windows and downloads the emails in EML format. The second script attempts to create a PST file from the downloaded EML files, but has encountered problems in the process. I appreciate any suggestions or solutions that can help resolve this issue.

0

There are 0 best solutions below