Extract files embedded in .doc files using Python

42 Views Asked by At

I have .doc files with embedded Excel, PowerPoint, PDF, and other files, and I want to extract these embedded files using Python. Extracting from .docx is straightforward with the zipfile library. With .doc, I am able to get the .ole files (see the code below, which is based on something I found online), but how can I extract the actual files after that?

Output: screenshot of folder

# extract embedded using OleFileIO_PL alone 
import os 
import OleFileIO_PL 
def extract_embedded_ole(fname):
    """Dump selected nested streams of an OLE2 compound file to ``.ole`` files.

    Every entry returned by ``listdir()`` whose path has more than one
    component, whose type is "stream", and whose path contains one of the
    well-known document stream names is written verbatim to::

        res--<fname>.embeddings/<parent path>/<basename>-emb-<name>-<n>.ole

    The counter ``<n>`` advances once for every path component of every
    nested entry, so the numbers in the file names are unique but not
    consecutive.

    Args:
        fname: Path of the OLE2 file (for example a ``.doc``) to scan.

    Raises:
        OSError: If an output directory cannot be created.
    """
    # Path components that mark a stream as embedded-document payload.
    wanted_names = (
        'Workbook', 'WordDocument', 'Package', 'VisioDocument',
        'PowerPoint Document', 'Book', 'CONTENTS',
    )
    # Type code compared against get_type(); presumably the library's
    # STGTY_STREAM constant -- confirm against the OleFileIO_PL docs.
    stream_type = 2

    ole = OleFileIO_PL.OleFileIO(fname)
    try:
        i = 0
        for stream in ole.listdir():
            # Top-level entries belong to the host document itself.
            if not (isinstance(stream, list) and len(stream) > 1):
                continue

            # Both values are the same for every component of this path,
            # so compute them once instead of once per component.
            is_stream = ole.get_type(stream) == stream_type
            out_dir = "res--" + fname + ".embeddings/" + "/".join(stream[:-1])

            for s in stream:
                i += 1
                if not (is_stream and s in wanted_names):
                    continue

                try:
                    os.makedirs(out_dir)
                except OSError:
                    # An already-existing directory is fine; anything else
                    # (permissions, a file in the way, ...) is a real error.
                    if not os.path.isdir(out_dir):
                        raise

                # Write out the raw stream contents.
                out_name = (out_dir + "/" + os.path.split(fname)[1]
                            + "-emb-" + s + "-" + str(i) + ".ole")
                with open(out_name, 'wb') as out_file:
                    out_file.write(ole.openstream(stream).read())
    finally:
        # Release the underlying file handle even if extraction fails.
        ole.close()

# Example usage: run the extraction only when executed as a script, so that
# importing this module does not touch the filesystem.
if __name__ == "__main__":
    extract_embedded_ole("f1.doc")
0

There are 0 best solutions below