ARM64 Code Injection on Apple M1 crashes with EXC_BAD_ACCESS

304 Views Asked by At

I have the following code to inject a payload into a process. The payload then calls dlopen to load a dynamic library (the "dll" below) from disk.

When I inject the payload into my own process (passing getpid() as the PID), it works fine: I can see that the dll loads and prints correctly.

When I inject it into another process, I get:

0x104cc0030: ldr x0, [x29, #0x18]

Thread 2: EXC_BAD_ACCESS (code=1, address=0x10005e85c)

This ldr x0, [x29, #0x18] is where the payload loads thread_id for the call _pthread_set_self(thread_id);. It seems that whenever any code tries to read thread_id, it gets a bad access error. I don't see why, though, because my stack is read/write and my code is read/execute.

The code for injecting is as follows:

#include <sys/types.h>
#include <unistd.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <thread>
#include <utility>

/// Stateless entry point for loading a module into a running process.
struct Injector
{
    /// Injects the module at `module_path` into the process identified by `pid`.
    ///
    /// @param module_path Path of the module the target process should load.
    /// @param pid         Process id of the target.
    /// @param bootstrap   Opaque pointer accepted for the caller's convenience.
    /// @return `true` on success, `false` otherwise.
    static auto Inject(std::string module_path, std::int32_t pid, void* bootstrap) noexcept -> bool;
};


// Implementation

#if defined(__APPLE__)
#include <dlfcn.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach-o/loader.h>
#include <mach-o/dyld_images.h>
#include <mach-o/nlist.h>
#include <ptrauth.h>
#include <pthread.h>

#include <cstdint>
#include <string>
#endif

#if defined(__APPLE__)
// Hands out the AArch64 payload that runs inside the remote target: it starts a
// pthread there, and that pthread's entry point calls dlopen on the module path.
//
// The size of the payload (in bytes) is stored through `instructions_size`, and
// the returned pointer refers to a single static, writable buffer so the caller
// can overwrite the placeholder dlopen address before copying the bytes out.
auto remote_load_library = [](std::size_t* instructions_size) -> std::uint8_t* {
    /* C equivalent of the machine code in `payload`:

       pthread_t LoadLibrary(void** data, void* dll_path)
       {
           void* (*_dlopen_pointer)(void* param)                    = data[0];
           void (*_pthread_set_self)(pthread_t thread)              = data[1];
           int (*_pthread_create_from_mach_thread)(pthread_t *, const pthread_attr_t *,
                                                    void *(*)(void *), void *) = data[2];
           kern_return_t (*_thread_suspend)(thread_read_t target_act) = data[3];
           mach_port_t (*_mach_thread_self)(void)                   = data[4];

           pthread_t thread_id = 0;
           _pthread_create_from_mach_thread(&thread_id, nullptr, _dlopen_pointer, dll_path);
           _pthread_set_self(thread_id);
           _thread_suspend(_mach_thread_self());
           return thread_id;
       }

       void* _dlopen_pointer(void* param)
       {
           decltype(dlopen)* _dlopen = (decltype(dlopen)*)0xA000B000DEADBEEF;
           return _dlopen((const char*)param, RTLD_NOW);
       }
    */
    static std::uint8_t payload[] = {
        // ---- LoadLibrary: prologue, 48-byte frame ----
        0xFD, 0x7B, 0xBD, 0xA9,    // stp  x29, x30, [sp, #-48]!
        0xF5, 0x0B, 0x00, 0xF9,    // str  x21, [sp, #16]
        0xF4, 0x4F, 0x02, 0xA9,    // stp  x20, x19, [sp, #32]
        0xFD, 0x03, 0x00, 0x91,    // mov  x29, sp

        // ---- LoadLibrary: fetch the five function pointers from data[] ----
        0x02, 0x4C, 0x40, 0xA9,    // ldp  x2, x19, [x0]        ; x2 = data[0] (_dlopen_pointer), x19 = data[1] (_pthread_set_self)
        0x08, 0x50, 0x41, 0xA9,    // ldp  x8, x20, [x0, #16]   ; x8 = data[2] (_pthread_create_from_mach_thread), x20 = data[3] (_thread_suspend)
        0x15, 0x10, 0x40, 0xF9,    // ldr  x21, [x0, #32]       ; x21 = data[4] (_mach_thread_self)

        // ---- LoadLibrary: create the pthread ----
        0xBF, 0x0F, 0x00, 0xF9,    // str  xzr, [x29, #24]      ; thread_id = 0
        0xE3, 0x03, 0x01, 0xAA,    // mov  x3, x1               ; arg 3 = dll_path
        0xE1, 0x03, 0x1F, 0xAA,    // mov  x1, xzr              ; arg 1 = nullptr (attributes)
        0xA0, 0x63, 0x00, 0x91,    // add  x0, x29, #24         ; arg 0 = &thread_id
        0x00, 0x01, 0x3F, 0xD6,    // blr  x8                   ; _pthread_create_from_mach_thread(...)

        // ---- LoadLibrary: register the thread, then suspend ourselves ----
        0xA0, 0x0F, 0x40, 0xF9,    // ldr  x0, [x29, #24]       ; thread_id
        0x60, 0x02, 0x3F, 0xD6,    // blr  x19                  ; _pthread_set_self(thread_id)
        0xA0, 0x02, 0x3F, 0xD6,    // blr  x21                  ; _mach_thread_self()
        0x80, 0x02, 0x3F, 0xD6,    // blr  x20                  ; _thread_suspend(<result of _mach_thread_self>)

        // ---- LoadLibrary: epilogue ----
        0xA0, 0x0F, 0x40, 0xF9,    // ldr  x0, [x29, #24]       ; return value = thread_id
        0xF4, 0x4F, 0x42, 0xA9,    // ldp  x20, x19, [sp, #32]
        0xF5, 0x0B, 0x40, 0xF9,    // ldr  x21, [sp, #16]
        0xFD, 0x7B, 0xC3, 0xA8,    // ldp  x29, x30, [sp], #48
        0xC0, 0x03, 0x5F, 0xD6,    // ret

        // ---- five nop instructions between the two routines ----
        0x1F, 0x20, 0x03, 0xD5,    // nop
        0x1F, 0x20, 0x03, 0xD5,    // nop
        0x1F, 0x20, 0x03, 0xD5,    // nop
        0x1F, 0x20, 0x03, 0xD5,    // nop
        0x1F, 0x20, 0x03, 0xD5,    // nop

        // ---- _dlopen_pointer: last 24 bytes; the four mov/movk immediates are placeholders ----
        0x41, 0x00, 0x80, 0x52,    // mov  w1, #2                  ; RTLD_NOW = 0x02
        0xE2, 0xDD, 0x97, 0xD2,    // mov  x2, #0xBEEF             ; dlopen address, bits 15..0
        0xA2, 0xD5, 0xBB, 0xF2,    // movk x2, #0xDEAD, lsl #16    ; dlopen address, bits 31..16
        0x02, 0x00, 0xD6, 0xF2,    // movk x2, #0xB000, lsl #32    ; dlopen address, bits 47..32
        0x02, 0x00, 0xF4, 0xF2,    // movk x2, #0xA000, lsl #48    ; dlopen address, bits 63..48
        0x40, 0x00, 0x1F, 0xD6,    // br   x2                      ; tail-call dlopen(param, RTLD_NOW)
    };

    *instructions_size = sizeof payload;
    return payload;
};

bool Injector::Inject(std::string module_path, std::int32_t pid, void* bootstrap) noexcept
{
    std::size_t assembly_size = 0;
    std::uint8_t* assembly = remote_load_library(&assembly_size);

    //Retrieve a task port for the remote process..
    mach_port_t    remote_task = 0;
    mach_error_t err = task_for_pid(mach_task_self(), pid, &remote_task);
    if (err == 5)
    {
        fprintf(stderr, "Could not access task for pid %d. You probably need to add user to procmod group OR run this program as root\n", pid);
        return false;
    }

    std::uint64_t stack_size = 16 * 1024;

    // Allocate and write the path size..
    mach_vm_address_t remote_path = reinterpret_cast<mach_vm_address_t>(nullptr);
    mach_vm_allocate(remote_task, &remote_path, module_path.size() + 1, VM_FLAGS_ANYWHERE);
    mach_vm_write(remote_task, remote_path, reinterpret_cast<mach_vm_offset_t>(module_path.c_str()), static_cast<mach_msg_type_number_t>(module_path.size()));
    mach_vm_protect(remote_task, remote_path, module_path.size() + 1, 0, VM_PROT_READ | VM_PROT_WRITE);

    //Update our dlopen address..
    void* dlopen_ptr = dlsym(RTLD_DEFAULT, "dlopen");
    
    // Bits 21 to 5 = imm of the MOV wide immediate instruction
    // https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/MOV--wide-immediate---Move--wide-immediate---an-alias-of-MOVZ-?lang=en

    auto copy_bits = [](std::uint32_t &reg, std::uint16_t value) {
        for (int bit = 20, valueBit = 15; bit >= 5; --bit, --valueBit)
        {
            std::uint32_t bit_to_set = ((value >> valueBit) & 1);
            reg ^= (-bit_to_set ^ reg) & (static_cast<std::uint32_t>(1) << bit);
        }
    };
    
    // Convert the instruction bytes to 32-bit instruction
    auto decode_instruction = [](std::uint8_t instructions[]) -> std::uint32_t {
        //Note endianness
        return (static_cast<std::uint32_t>(instructions[3]) << 24) |
               (static_cast<std::uint32_t>(instructions[2]) << 16) |
               (static_cast<std::uint32_t>(instructions[1]) << 8) |
               (static_cast<std::uint32_t>(instructions[0]) << 0);
    };
    
    // Convert the 32-bit instruction back into instruction bytes
    auto encode_instruction = [](std::uint32_t instruction, std::uint8_t (&instructions)[4]) {
        //Note endianness
        instructions[3] = (instruction & 0xFF000000) >> 24;
        instructions[2] = (instruction & 0x00FF0000) >> 16;
        instructions[1] = (instruction & 0x0000FF00) >> 8;
        instructions[0] = (instruction & 0x000000FF) >> 0;
    };
    
    // Get the instructions offset, and write the address of dlopen to each part, 16-bits at a time
    auto write_instruction_address = [&](std::uint32_t address_intermediate, std::uint8_t assembly[], std::size_t offset) {
        std::uint8_t instructions[] = {0x00, 0x00, 0x00, 0x00};
        memcpy(&instructions, &assembly[assembly_size + offset], sizeof(instructions));
        
        std::uint32_t instruction = decode_instruction(instructions);
        copy_bits(instruction, address_intermediate);
        encode_instruction(instruction, instructions);
        memcpy(&assembly[assembly_size + offset], &instructions, sizeof(instructions));
    };
    
    // Convert the dlopen address to its 16-bit parts
    std::uintptr_t dlopen_address = reinterpret_cast<std::uintptr_t>(dlopen_ptr);
    std::uint32_t beef = ((dlopen_address & 0x000000000000FFFF) >> 0);
    std::uint32_t dead = ((dlopen_address & 0x00000000FFFF0000) >> 16);
    std::uint32_t b000 = ((dlopen_address & 0x0000FFFF00000000) >> 32);
    std::uint32_t a000 = ((dlopen_address & 0xFFFF000000000000) >> 48);
    
    // Write the encoded instructions back into the assembly payload
    // So it _dlopen_pointer will have the real address instead of 0xA000B000DEADBEEF
    write_instruction_address(a000, assembly, -8);
    write_instruction_address(b000, assembly, -12);
    write_instruction_address(dead, assembly, -16);
    write_instruction_address(beef, assembly, -20);

    //Allocate and write our remote code
    mach_vm_address_t remote_code = reinterpret_cast<mach_vm_address_t>(nullptr);
    mach_vm_allocate(remote_task, &remote_code, assembly_size, VM_FLAGS_ANYWHERE);
    mach_vm_write(remote_task, remote_code, reinterpret_cast<mach_vm_offset_t>(&assembly[0]), static_cast<mach_msg_type_number_t>(assembly_size));
    mach_vm_protect(remote_task, remote_code, assembly_size, false, VM_PROT_READ | VM_PROT_EXECUTE);

    //Allocate remote stack
    mach_vm_address_t remote_stack = reinterpret_cast<mach_vm_address_t>(nullptr);
    mach_vm_allocate(remote_task, &remote_stack, stack_size, VM_FLAGS_ANYWHERE);
    mach_vm_protect(remote_task, remote_stack, stack_size, true, VM_PROT_READ | VM_PROT_WRITE);

    //Allocate & write parameters
    void* parameters[] = {
        (void*)((remote_code + assembly_size) - 24),
        (void*)dlsym(RTLD_DEFAULT, "_pthread_set_self"),
        (void*)dlsym(RTLD_DEFAULT, "pthread_create_from_mach_thread"),
        (void*)dlsym(RTLD_DEFAULT, "thread_suspend"),
        (void*)dlsym(RTLD_DEFAULT, "mach_thread_self")
    };

    mach_vm_address_t remote_parameters = reinterpret_cast<mach_vm_address_t>(nullptr);
    mach_vm_allocate(remote_task, &remote_parameters, sizeof(parameters), VM_FLAGS_ANYWHERE);
    mach_vm_write(remote_task, remote_parameters, reinterpret_cast<mach_vm_offset_t>(&parameters[0]), static_cast<mach_msg_type_number_t>(sizeof(parameters)));

    //Offset stack pointer.
    mach_vm_address_t local_stack = remote_stack;
    remote_stack += (stack_size / 2);  //real stack location
    
    
    // To support ARMv7 and ARMv8, we use arm_unified_thread_state_t intead of arm_thread_state64_t
    arm_unified_thread_state_t state = {0};
    memset(&state, 0, sizeof(state));

    //Parameter order for aarch64: x0, x1, x2, x3, x4, x5
    state.ash.flavor = ARM_THREAD_STATE64;
    state.ash.count = ARM_THREAD_STATE64_COUNT;
    state.ts_64.__x[0] = remote_parameters; //pointers to functions
    state.ts_64.__x[1] = remote_path;       //path of module to load
    state.ts_64.__pc = (mach_vm_address_t)remote_code; //code/payload to execute
    state.ts_64.__sp = remote_stack;
    state.ts_64.__lr = 0x0000000000000000;  //Return address. Thread should suspend anyway.

    //Create our remote thread
    thread_act_t thread;
    err = thread_create_running(remote_task, ARM_THREAD_STATE64, (thread_state_t) &state.ts_64, ARM_THREAD_STATE64_COUNT, &thread);
    if (err != KERN_SUCCESS)
    {
        fprintf(stderr, "ERROR!\n");
        return false;
    }

    return true;
}
#endif

// Usage: <program> [pid] [dylib-path]
// With no arguments the library at the built-in path is injected into this
// process itself. The exit status reports whether the injection succeeded.
int main(int argc, const char * argv[]) {
    printf("Running\n");

    const char* dll_path = "/users/brandon/Desktop/test.dylib";
    std::int32_t pid = getpid();

    // Optional first argument: PID of the target process.
    if (argc > 1) {
        char* end = nullptr;
        const long parsed = std::strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > INT32_MAX) {
            fprintf(stderr, "Usage: %s [pid] [dylib-path]\n", argv[0]);
            return EXIT_FAILURE;
        }
        pid = static_cast<std::int32_t>(parsed);
    }

    // Optional second argument: path of the library to load.
    if (argc > 2) {
        dll_path = argv[2];
    }

    if (!Injector::Inject(dll_path, pid, nullptr)) {
        fprintf(stderr, "Injection into pid %d failed\n", pid);
        return EXIT_FAILURE;
    }

    return 0;
}

Any ideas why I get EXC_BAD_ACCESS when replacing getpid() in main with another process' PID?

0

There are 0 best solutions below