I have the following code to inject a payload into a process.
The payload will then call dlopen to load a dll from disk.
When I inject the payload into my own process (passing getpid() as the target), it works fine: I can see the dll load and print correctly.
When I inject it into another process, I get:
0x104cc0030: ldr x0, [x29, #0x18]
Thread 2: EXC_BAD_ACCESS (code=1, address=0x10005e85c)
The faulting ldr x0, [x29, #0x18] is the instruction that loads thread_id for the call to _pthread_set_self(thread_id);.
It seems that whenever any code tries to read thread_id, it gives a bad access error.
I don't see how that can happen, though, because my stack is read/write and my code is read/execute.
The code for injecting is as follows:
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <thread>
#include <utility>
#include <sys/types.h>
#include <unistd.h>
/// Namespace-like holder for the library-injection entry point.
struct Injector
{
    /// Requests that the process identified by `pid` load the module at `module_path`.
    ///
    /// @param module_path Path of the module the target process should load.
    /// @param pid         Identifier of the target process.
    /// @param bootstrap   Opaque pointer supplied by the caller; its meaning is
    ///                    defined by the platform-specific implementation.
    /// @return true when the injection was started successfully, false otherwise.
    static bool Inject(std::string module_path, std::int32_t pid, void* bootstrap) noexcept;
};
// Implementation
#if defined(__APPLE__)
#include <dlfcn.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach-o/loader.h>
#include <mach-o/dyld_images.h>
#include <mach-o/nlist.h>
#include <ptrauth.h>
#include <pthread.h>
#include <cstdint>
#include <string>
#endif
#if defined(__APPLE__)
// Returns the AArch64 machine code that is copied into the target process.
// The code spawns a pthread in the target, and that pthread calls dlopen.
//
// instructions_size: out-parameter that receives the payload size in bytes
//                    (must not be null).
// Returns a pointer to the first byte of the payload. The buffer is a mutable
// function-local static, so every call yields the same storage and any change
// a caller makes to it (such as patching the dlopen address placeholder) is
// visible to later callers.
//
// The payload is equivalent to the following C code:
//
//   pthread_t LoadLibrary(void** data, void* dll_path)
//   {
//       void* (*_dlopen_pointer)(void* param) = data[0];
//       void (*_pthread_set_self)(pthread_t thread) = data[1];
//       int (*_pthread_create_from_mach_thread)(pthread_t *, const pthread_attr_t *, void *(*)(void *), void *) = data[2];
//       kern_return_t (*_thread_suspend)(thread_read_t target_act) = data[3];
//       mach_port_t (*_mach_thread_self)(void) = data[4];
//       pthread_t thread_id = 0;
//       _pthread_create_from_mach_thread(&thread_id, nullptr, _dlopen_pointer, dll_path);
//       _pthread_set_self(thread_id);
//       _thread_suspend(_mach_thread_self());
//       return thread_id;
//   }
//
//   void* _dlopen_pointer(void* param)
//   {
//       decltype(dlopen)* _dlopen = (decltype(dlopen)*)0xA000B000DEADBEEF;
//       return _dlopen((const char*)param, RTLD_NOW);
//   }
auto remote_load_library = [](std::size_t* instructions_size) -> std::uint8_t* {
    static std::uint8_t payload[] = {
        // ---- LoadLibrary(data = x0, dll_path = x1): prologue ----
        0xFD, 0x7B, 0xBD, 0xA9, // stp  x29, x30, [sp, #-48]!
        0xF5, 0x0B, 0x00, 0xF9, // str  x21, [sp, #16]
        0xF4, 0x4F, 0x02, 0xA9, // stp  x20, x19, [sp, #32]
        0xFD, 0x03, 0x00, 0x91, // mov  x29, sp

        // ---- load the function table passed in x0 ----
        0x02, 0x4C, 0x40, 0xA9, // ldp  x2, x19, [x0]        x2 = data[0] (_dlopen_pointer), x19 = data[1] (_pthread_set_self)
        0x08, 0x50, 0x41, 0xA9, // ldp  x8, x20, [x0, #16]   x8 = data[2] (_pthread_create_from_mach_thread), x20 = data[3] (_thread_suspend)
        0x15, 0x10, 0x40, 0xF9, // ldr  x21, [x0, #32]       x21 = data[4] (_mach_thread_self)

        // ---- _pthread_create_from_mach_thread(&thread_id, nullptr, _dlopen_pointer, dll_path) ----
        0xBF, 0x0F, 0x00, 0xF9, // str  xzr, [x29, #24]      pthread_t thread_id = 0
        0xE3, 0x03, 0x01, 0xAA, // mov  x3, x1               arg 3 = dll_path
        0xE1, 0x03, 0x1F, 0xAA, // mov  x1, xzr              arg 1 = nullptr
        0xA0, 0x63, 0x00, 0x91, // add  x0, x29, #24         arg 0 = &thread_id
        0x00, 0x01, 0x3F, 0xD6, // blr  x8

        // ---- _pthread_set_self(thread_id) ----
        0xA0, 0x0F, 0x40, 0xF9, // ldr  x0, [x29, #24]       arg 0 = thread_id
        0x60, 0x02, 0x3F, 0xD6, // blr  x19

        // ---- _thread_suspend(_mach_thread_self()) ----
        0xA0, 0x02, 0x3F, 0xD6, // blr  x21                  x0 = _mach_thread_self()
        0x80, 0x02, 0x3F, 0xD6, // blr  x20                  _thread_suspend(x0)

        // ---- epilogue: return thread_id ----
        0xA0, 0x0F, 0x40, 0xF9, // ldr  x0, [x29, #24]
        0xF4, 0x4F, 0x42, 0xA9, // ldp  x20, x19, [sp, #32]
        0xF5, 0x0B, 0x40, 0xF9, // ldr  x21, [sp, #16]
        0xFD, 0x7B, 0xC3, 0xA8, // ldp  x29, x30, [sp], #48
        0xC0, 0x03, 0x5F, 0xD6, // ret

        // ---- padding between the two routines ----
        0x1F, 0x20, 0x03, 0xD5, // nop
        0x1F, 0x20, 0x03, 0xD5, // nop
        0x1F, 0x20, 0x03, 0xD5, // nop
        0x1F, 0x20, 0x03, 0xD5, // nop
        0x1F, 0x20, 0x03, 0xD5, // nop

        // ---- _dlopen_pointer(param = x0): last 24 bytes of the payload ----
        0x41, 0x00, 0x80, 0x52, // mov  w1, #2               RTLD_NOW = 0x02
        0xE2, 0xDD, 0x97, 0xD2, // mov  x2, #0xBEEF          dlopen address placeholder, bits 0-15
        0xA2, 0xD5, 0xBB, 0xF2, // movk x2, #0xDEAD, lsl #16 dlopen address placeholder, bits 16-31
        0x02, 0x00, 0xD6, 0xF2, // movk x2, #0xB000, lsl #32 dlopen address placeholder, bits 32-47
        0x02, 0x00, 0xF4, 0xF2, // movk x2, #0xA000, lsl #48 dlopen address placeholder, bits 48-63
        0x40, 0x00, 0x1F, 0xD6, // br   x2                   tail-call dlopen(param, RTLD_NOW)
    };

    *instructions_size = sizeof(payload);
    return payload;
};
bool Injector::Inject(std::string module_path, std::int32_t pid, void* bootstrap) noexcept
{
std::size_t assembly_size = 0;
std::uint8_t* assembly = remote_load_library(&assembly_size);
//Retrieve a task port for the remote process..
mach_port_t remote_task = 0;
mach_error_t err = task_for_pid(mach_task_self(), pid, &remote_task);
if (err == 5)
{
fprintf(stderr, "Could not access task for pid %d. You probably need to add user to procmod group OR run this program as root\n", pid);
return false;
}
std::uint64_t stack_size = 16 * 1024;
// Allocate and write the path size..
mach_vm_address_t remote_path = reinterpret_cast<mach_vm_address_t>(nullptr);
mach_vm_allocate(remote_task, &remote_path, module_path.size() + 1, VM_FLAGS_ANYWHERE);
mach_vm_write(remote_task, remote_path, reinterpret_cast<mach_vm_offset_t>(module_path.c_str()), static_cast<mach_msg_type_number_t>(module_path.size()));
mach_vm_protect(remote_task, remote_path, module_path.size() + 1, 0, VM_PROT_READ | VM_PROT_WRITE);
//Update our dlopen address..
void* dlopen_ptr = dlsym(RTLD_DEFAULT, "dlopen");
// Bits 21 to 5 = imm of the MOV wide immediate instruction
// https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/MOV--wide-immediate---Move--wide-immediate---an-alias-of-MOVZ-?lang=en
auto copy_bits = [](std::uint32_t ®, std::uint16_t value) {
for (int bit = 20, valueBit = 15; bit >= 5; --bit, --valueBit)
{
std::uint32_t bit_to_set = ((value >> valueBit) & 1);
reg ^= (-bit_to_set ^ reg) & (static_cast<std::uint32_t>(1) << bit);
}
};
// Convert the instruction bytes to 32-bit instruction
auto decode_instruction = [](std::uint8_t instructions[]) -> std::uint32_t {
//Note endianness
return (static_cast<std::uint32_t>(instructions[3]) << 24) |
(static_cast<std::uint32_t>(instructions[2]) << 16) |
(static_cast<std::uint32_t>(instructions[1]) << 8) |
(static_cast<std::uint32_t>(instructions[0]) << 0);
};
// Convert the 32-bit instruction back into instruction bytes
auto encode_instruction = [](std::uint32_t instruction, std::uint8_t (&instructions)[4]) {
//Note endianness
instructions[3] = (instruction & 0xFF000000) >> 24;
instructions[2] = (instruction & 0x00FF0000) >> 16;
instructions[1] = (instruction & 0x0000FF00) >> 8;
instructions[0] = (instruction & 0x000000FF) >> 0;
};
// Get the instructions offset, and write the address of dlopen to each part, 16-bits at a time
auto write_instruction_address = [&](std::uint32_t address_intermediate, std::uint8_t assembly[], std::size_t offset) {
std::uint8_t instructions[] = {0x00, 0x00, 0x00, 0x00};
memcpy(&instructions, &assembly[assembly_size + offset], sizeof(instructions));
std::uint32_t instruction = decode_instruction(instructions);
copy_bits(instruction, address_intermediate);
encode_instruction(instruction, instructions);
memcpy(&assembly[assembly_size + offset], &instructions, sizeof(instructions));
};
// Convert the dlopen address to its 16-bit parts
std::uintptr_t dlopen_address = reinterpret_cast<std::uintptr_t>(dlopen_ptr);
std::uint32_t beef = ((dlopen_address & 0x000000000000FFFF) >> 0);
std::uint32_t dead = ((dlopen_address & 0x00000000FFFF0000) >> 16);
std::uint32_t b000 = ((dlopen_address & 0x0000FFFF00000000) >> 32);
std::uint32_t a000 = ((dlopen_address & 0xFFFF000000000000) >> 48);
// Write the encoded instructions back into the assembly payload
// So it _dlopen_pointer will have the real address instead of 0xA000B000DEADBEEF
write_instruction_address(a000, assembly, -8);
write_instruction_address(b000, assembly, -12);
write_instruction_address(dead, assembly, -16);
write_instruction_address(beef, assembly, -20);
//Allocate and write our remote code
mach_vm_address_t remote_code = reinterpret_cast<mach_vm_address_t>(nullptr);
mach_vm_allocate(remote_task, &remote_code, assembly_size, VM_FLAGS_ANYWHERE);
mach_vm_write(remote_task, remote_code, reinterpret_cast<mach_vm_offset_t>(&assembly[0]), static_cast<mach_msg_type_number_t>(assembly_size));
mach_vm_protect(remote_task, remote_code, assembly_size, false, VM_PROT_READ | VM_PROT_EXECUTE);
//Allocate remote stack
mach_vm_address_t remote_stack = reinterpret_cast<mach_vm_address_t>(nullptr);
mach_vm_allocate(remote_task, &remote_stack, stack_size, VM_FLAGS_ANYWHERE);
mach_vm_protect(remote_task, remote_stack, stack_size, true, VM_PROT_READ | VM_PROT_WRITE);
//Allocate & write parameters
void* parameters[] = {
(void*)((remote_code + assembly_size) - 24),
(void*)dlsym(RTLD_DEFAULT, "_pthread_set_self"),
(void*)dlsym(RTLD_DEFAULT, "pthread_create_from_mach_thread"),
(void*)dlsym(RTLD_DEFAULT, "thread_suspend"),
(void*)dlsym(RTLD_DEFAULT, "mach_thread_self")
};
mach_vm_address_t remote_parameters = reinterpret_cast<mach_vm_address_t>(nullptr);
mach_vm_allocate(remote_task, &remote_parameters, sizeof(parameters), VM_FLAGS_ANYWHERE);
mach_vm_write(remote_task, remote_parameters, reinterpret_cast<mach_vm_offset_t>(¶meters[0]), static_cast<mach_msg_type_number_t>(sizeof(parameters)));
//Offset stack pointer.
mach_vm_address_t local_stack = remote_stack;
remote_stack += (stack_size / 2); //real stack location
// To support ARMv7 and ARMv8, we use arm_unified_thread_state_t intead of arm_thread_state64_t
arm_unified_thread_state_t state = {0};
memset(&state, 0, sizeof(state));
//Parameter order for aarch64: x0, x1, x2, x3, x4, x5
state.ash.flavor = ARM_THREAD_STATE64;
state.ash.count = ARM_THREAD_STATE64_COUNT;
state.ts_64.__x[0] = remote_parameters; //pointers to functions
state.ts_64.__x[1] = remote_path; //path of module to load
state.ts_64.__pc = (mach_vm_address_t)remote_code; //code/payload to execute
state.ts_64.__sp = remote_stack;
state.ts_64.__lr = 0x0000000000000000; //Return address. Thread should suspend anyway.
//Create our remote thread
thread_act_t thread;
err = thread_create_running(remote_task, ARM_THREAD_STATE64, (thread_state_t) &state.ts_64, ARM_THREAD_STATE64_COUNT, &thread);
if (err != KERN_SUCCESS)
{
fprintf(stderr, "ERROR!\n");
return false;
}
return true;
}
#endif
// Usage: program [pid] [module_path]
// With no arguments the library is injected into this process itself, using
// the built-in default path. Exits with EXIT_FAILURE when the pid argument is
// not a positive decimal number or when the injection fails.
int main(int argc, const char * argv[]) {
    std::printf("Running\n");

    std::int32_t pid = getpid();
    if (argc > 1) {
        char* end = nullptr;
        const long parsed = std::strtol(argv[1], &end, 10);
        // Reject empty input, trailing garbage and values outside the pid range.
        if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > INT32_MAX) {
            std::fprintf(stderr, "Invalid pid: %s\n", argv[1]);
            return EXIT_FAILURE;
        }
        pid = static_cast<std::int32_t>(parsed);
    }

    const char* dll_path = (argc > 2) ? argv[2] : "/users/brandon/Desktop/test.dylib";
    if (!Injector::Inject(dll_path, pid, nullptr)) {
        std::fprintf(stderr, "Injection into pid %d failed\n", pid);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Any ideas why I get EXC_BAD_ACCESS when replacing getpid() in main with another process' PID?