Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linux: More safe stack cleanup for clone #3424

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,21 +394,26 @@ struct StackFramePlusRet {
};

[[noreturn]]
static void Clone3HandlerRet() {
StackFrameData *Data = (StackFrameData*)alloca(0);
static void CloneBody(StackFrameData *Data, bool NeedsDataFree) {
uint64_t Result = FEX::HLE::HandleNewClone(Data->Thread, Data->CTX, &Data->NewFrame, &Data->GuestArgs);
FEX::LinuxEmulation::Threads::DeallocateStackObject(Data->GuestArgs.NewStack);
// To behave like a real clone, we now just need to call exit here
exit(Result);
auto Stack = Data->GuestArgs.NewStack;
if (NeedsDataFree) {
FEXCore::Allocator::free(Data);
}

FEX::LinuxEmulation::Threads::DeallocateStackObjectAndExit(Stack, Result);
FEX_UNREACHABLE;
}

[[noreturn]]
static void Clone3HandlerRet() {
StackFrameData *Data = (StackFrameData*)alloca(0);
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved
CloneBody(Data, false);
}

static int Clone2HandlerRet(void *arg) {
StackFrameData *Data = (StackFrameData*)arg;
uint64_t Result = FEX::HLE::HandleNewClone(Data->Thread, Data->CTX, &Data->NewFrame, &Data->GuestArgs);
FEX::LinuxEmulation::Threads::DeallocateStackObject(Data->GuestArgs.NewStack);
FEXCore::Allocator::free(arg);
return Result;
CloneBody(Data, true);
}

// Clone3 flags
Expand Down
4 changes: 3 additions & 1 deletion Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ desc: Glue logic, STRACE magic
#define SYSCALL_ARCH_NAME Arm64
#endif

#include "LinuxSyscalls/x64/SyscallsEnum.h"

#define CONCAT_(a, b) a ## b
#define CONCAT(a, b) CONCAT_(a, b)
#define SYSCALL_DEF(name) ( SYSCALL_ARCH_NAME::CONCAT(CONCAT(SYSCALL_, SYSCALL_ARCH_NAME), _##name))
#define SYSCALL_DEF(name) ( HLE::SYSCALL_ARCH_NAME::CONCAT(CONCAT(SYSCALL_, SYSCALL_ARCH_NAME), _##name))

// #define DEBUG_STRACE

Expand Down
76 changes: 61 additions & 15 deletions Source/Tools/LinuxEmulation/LinuxSyscalls/Utils/Threads.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: MIT
#include "LinuxSyscalls/Utils/Threads.h"
#include "LinuxSyscalls/Syscalls.h"

#include <FEXCore/Core/Context.h>
#include <FEXCore/Utils/Threads.h>
Expand All @@ -12,33 +13,53 @@ namespace FEX::LinuxEmulation::Threads {
void *Ptr;
size_t Size;
};

// Entry in DeadStackPool: a thread stack mapping that has been handed back but may
// still be in use by the thread that is exiting on it.
//
// NOTE(review): AddStackToDeadPool returns a pointer to ReadyToBeReaped inside the
// deque element, and the caller writes through it after DeadStackPoolMutex has been
// released, while AllocateStackObject may erase other elements under the lock.
// Presumably fextl::deque follows std::deque reference-invalidation rules for
// erase — confirm that this pointer cannot be invalidated in that window.
struct DeadStackPoolItem {
// Base address of the stack mapping.
void *Ptr;
// Size of the mapping in bytes; passed to munmap when the entry is garbage collected.
size_t Size;
// Inserted as false by AddStackToDeadPool; DeallocateStackObjectAndExit sets it to true
// immediately before issuing the exit syscall. AllocateStackObject reads it through
// std::atomic_ref and only reuses or unmaps entries for which it is true.
bool ReadyToBeReaped;
};

std::mutex DeadStackPoolMutex{};
std::mutex LiveStackPoolMutex{};

static fextl::deque<StackPoolItem> DeadStackPool{};
static fextl::deque<DeadStackPoolItem> DeadStackPool{};
static fextl::deque<StackPoolItem> LiveStackPool{};

void *AllocateStackObject() {
std::lock_guard lk{DeadStackPoolMutex};
if (DeadStackPool.size() == 0) {
// Nothing in the pool, just allocate
return FEXCore::Allocator::mmap(nullptr, FEX::LinuxEmulation::Threads::STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

// Keep the first item in the stack pool
auto Result = DeadStackPool.front().Ptr;
DeadStackPool.pop_front();
void *Ptr{};

for (auto it = DeadStackPool.begin(); it != DeadStackPool.end();) {
auto Ready = std::atomic_ref<bool>(it->ReadyToBeReaped);
bool ReadyToBeReaped = Ready.load();
if (Ptr == nullptr && ReadyToBeReaped) {
Ptr = it->Ptr;
it = DeadStackPool.erase(it);
continue;
}

// Erase the rest as a garbage collection step
for (auto &Item : DeadStackPool) {
FEXCore::Allocator::munmap(Item.Ptr, Item.Size);
if (ReadyToBeReaped) {
FEXCore::Allocator::munmap(it->Ptr, it->Size);
it = DeadStackPool.erase(it);
continue;
}

++it;
}

if (Ptr == nullptr) {
Ptr = FEXCore::Allocator::mmap(nullptr, FEX::LinuxEmulation::Threads::STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}
return Result;

return Ptr;
}

void AddStackToDeadPool(void *Ptr) {
bool *AddStackToDeadPool(void *Ptr) {
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved
std::lock_guard lk{DeadStackPoolMutex};
DeadStackPool.emplace_back(StackPoolItem{Ptr, FEX::LinuxEmulation::Threads::STACK_SIZE});
auto &it = DeadStackPool.emplace_back(DeadStackPoolItem{Ptr, FEX::LinuxEmulation::Threads::STACK_SIZE, false});
return &it.ReadyToBeReaped;
}

void AddStackToLivePool(void *Ptr) {
Expand All @@ -61,6 +82,31 @@ namespace FEX::LinuxEmulation::Threads {
AddStackToDeadPool(Ptr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to make sure, is it intentional that this code path doesn't set ReadyToBeReaped?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional stack memory leak that is happening elsewhere and is a pre-existing condition. Subject to refactoring that has yet to occur. It's one of the reasons why I've spent months moving thread management to the frontend.

}

[[noreturn]]
void DeallocateStackObjectAndExit(void *Ptr, int Status) {
RemoveStackFromLivePool(Ptr);
auto ReadyToBeReaped = AddStackToDeadPool(Ptr);
*ReadyToBeReaped = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no other logic in here, and the code reading ReadyToBeReaped is protected by a mutex. Can't we move setting the boolean into AddStackToDeadPool and thereby avoid the obscure bool pointer return altogether? This could be optional behavior controlled by a function parameter. (This might also make it clearer why ReadyToBeReaped must be set here but not in the other call site of AddStackToDeadPool.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can move the ReadyToBeReaped store into AddStackToDeadPool if we continue passing the status all the way to that function so we can call exit there.
At the point of ReadyToBeReaped being set to true, we no longer have ownership of the stack. We can't return from a function and we can't call a function; we must do the syscall immediately.
The mutex guarding this doesn't matter at all; we are dancing around the thread no longer having a stack.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The stack ownership is released when setting ReadyToBeReaped indeed, but this ownership isn't re-assigned until AllocateStackObject observes ReadyToBeReaped==1. This observation can't happen as long as DeadStackPoolMutex is locked. Until then, accessing the stack should still be safe.

So instead of the boolean pointer dance, why not keep DeadStackPoolMutex locked until after we return from AddStackToDeadPool? This could be achieved for example by moving the mutex locking out of AddStackToDeadPool and locking/unlocking the mutex here manually. (To preserve current behavior elsewhere, the function could be renamed to "AddStackToDeadPoolInternal" and a helper function could be added that behaves like the current code).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed externally, this won't work because unlocking the mutex itself requires a stack. (Leaving discussion open for visibility.)


#ifdef _M_ARM_64
__asm volatile(
"mov x8, %[SyscallNum];"
"mov w0, %w[Result];"
"svc #0;"
:: [SyscallNum] "i" (SYSCALL_DEF(exit))
, [Result] "r" (Status)
: "memory", "x0", "x8");
#else
__asm volatile(
"mov %[Result], %%edi;"
"syscall;"
:: "a" (SYSCALL_DEF(exit))
, [Result] "r" (Status)
: "memory", "rdi");
#endif
FEX_UNREACHABLE;
}

namespace PThreads {
void *InitializeThread(void *Ptr);

Expand Down Expand Up @@ -154,7 +200,7 @@ namespace FEX::LinuxEmulation::Threads {

auto ClearStackPool = [&](auto &StackPool) {
for (auto it = StackPool.begin(); it != StackPool.end(); ) {
StackPoolItem &Item = *it;
auto &Item = *it;
uintptr_t ItemStack = reinterpret_cast<uintptr_t>(Item.Ptr);
if (ItemStack <= StackLocation && (ItemStack + Item.Size) > StackLocation) {
// This is our stack item, skip it
Expand Down
4 changes: 3 additions & 1 deletion Source/Tools/LinuxEmulation/LinuxSyscalls/Utils/Threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ namespace FEX::LinuxEmulation::Threads {
* Will not free the memory immediately, instead saving for reuse temporarily to solve race conditions on stack usage while stack tears down.
*
* @param Ptr The stack base from `AllocateStackObject`
* @param Status The status to pass to the exit syscall.
*/
void DeallocateStackObject(void *Ptr);
[[noreturn]]
void DeallocateStackObjectAndExit(void *Ptr, int Status);
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Registers thread creation handlers with FEXCore.
Expand Down
2 changes: 0 additions & 2 deletions Source/Tools/LinuxEmulation/LinuxSyscalls/x64/Syscalls.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ struct InternalThreadState;
}

namespace FEX::HLE::x64 {
#include "SyscallsEnum.h"

class x64SyscallHandler final : public FEX::HLE::SyscallHandler {
public:
x64SyscallHandler(FEXCore::Context::Context *ctx, FEX::HLE::SignalDelegator *_SignalDelegation);
Expand Down
3 changes: 2 additions & 1 deletion Source/Tools/LinuxEmulation/LinuxSyscalls/x64/SyscallsEnum.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ tags: LinuxSyscalls|syscalls-x86-64
*/
#pragma once

namespace FEX::HLE::x64 {
///< Enum containing all x86-64 linux syscalls for the guest kernel version
enum Syscalls_x64 {
SYSCALL_x64_read = 0,
Expand Down Expand Up @@ -479,4 +480,4 @@ enum Syscalls_x64 {
SYSCALL_x64_futex_time64 = ~0,
SYSCALL_x64_sched_rr_get_interval_time64 = ~0,
};

}
Loading