diff --git a/FEXCore/Source/Interface/Config/Config.json.in b/FEXCore/Source/Interface/Config/Config.json.in index ccde0a8ada..7ea15f2761 100644 --- a/FEXCore/Source/Interface/Config/Config.json.in +++ b/FEXCore/Source/Interface/Config/Config.json.in @@ -517,6 +517,13 @@ "Desc": [ "Override for a FEXServer socket path. Only useful for chroots." ] + }, + "NeedsSeccomp": { + "Type": "bool", + "Default": "false", + "Desc": [ + "Disables inline syscalls in order to support seccomp handling" + ] } } }, diff --git a/Source/Tools/FEXLoader/FEXLoader.cpp b/Source/Tools/FEXLoader/FEXLoader.cpp index d16844ef42..8a5847f5fc 100644 --- a/Source/Tools/FEXLoader/FEXLoader.cpp +++ b/Source/Tools/FEXLoader/FEXLoader.cpp @@ -271,6 +271,7 @@ int main(int argc, char** argv, char** const envp) { ExecutedWithFD = getauxval(AT_EXECFD) != 0; int FEXFD {StealFEXFDFromEnv("FEX_EXECVEFD")}; + int FEXSeccompFD {StealFEXFDFromEnv("FEX_SECCOMPFD")}; LogMan::Throw::InstallHandler(AssertHandler); LogMan::Msg::InstallHandler(MsgHandler); @@ -517,6 +518,8 @@ int main(int argc, char** argv, char** const envp) { CTX->AppendThunkDefinitions(FEX::VDSO::GetVDSOThunkDefinitions()); SignalDelegation->SetVDSOSigReturn(); + SyscallHandler->DeserializeSeccompFD(ParentThread, FEXSeccompFD); + FEXCore::Context::ExitReason ShutdownReason = FEXCore::Context::ExitReason::EXIT_SHUTDOWN; // There might already be an exit handler, leave it installed diff --git a/Source/Tools/LinuxEmulation/CMakeLists.txt b/Source/Tools/LinuxEmulation/CMakeLists.txt index 2d9ef28222..99005c166f 100644 --- a/Source/Tools/LinuxEmulation/CMakeLists.txt +++ b/Source/Tools/LinuxEmulation/CMakeLists.txt @@ -8,6 +8,9 @@ set (SRCS LinuxSyscalls/FileManagement.cpp LinuxSyscalls/LinuxAllocator.cpp LinuxSyscalls/NetStream.cpp + LinuxSyscalls/Seccomp/SeccompEmulator.cpp + LinuxSyscalls/Seccomp/BPFEmitter.cpp + LinuxSyscalls/Seccomp/Dumper.cpp LinuxSyscalls/SignalDelegator.cpp LinuxSyscalls/Syscalls.cpp LinuxSyscalls/SyscallsSMCTracking.cpp diff 
--git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/BPFEmitter.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/BPFEmitter.cpp new file mode 100644 index 0000000000..75258a2f5e --- /dev/null +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/BPFEmitter.cpp @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: MIT +/* +$info$ +tags: LinuxSyscalls|syscalls-shared +$end_info$ +*/ + +#include "LinuxSyscalls/Seccomp/BPFEmitter.h" +#include "LinuxSyscalls/Seccomp/SeccompEmulator.h" + +#include +#include +#include + +#define VALIDATE(cond) \ + do { \ + if (!(cond)) { \ + RETURN_ERROR(-EINVAL) \ + } \ + } while (0) +namespace FEX::HLE { +template +uint64_t BPFEmitter::HandleLoad(uint32_t BPFIP, const sock_filter* Inst) { + VALIDATE(BPF_SIZE(Inst->code) == BPF_W); + [[maybe_unused]] size_t OpSize {}; + + const auto DestReg = BPF_CLASS(Inst->code) == BPF_LD ? REG_A : REG_X; + + switch (BPF_MODE(Inst->code)) { + case BPF_IMM: { + auto Const = ConstPool.try_emplace(Inst->k, ARMEmitter::ForwardLabel {}); + EMIT_INST(ldr(DestReg, &Const.first->second)); + break; + } + case BPF_ABS: { + // ABS has some restrictions + // - Must be 4-byte aligned + // - Must be less than the size of seccomp_data + const auto Offset = Inst->k; + + // Need to be 4-byte aligned. + VALIDATE((Offset & 0b11) == 0); + // Ensure accessing inside of seccomp_data. + VALIDATE(Offset < sizeof(seccomp_data)); + + EMIT_INST(ldr(DestReg, REG_SECCOMP_DATA, Offset)); + break; + } + case BPF_MEM: + // Must be smaller than scratch space size. + VALIDATE(Inst->k < 16); + + EMIT_INST(ldr(DestReg, REG_SECCOMP_DATA, offsetof(WorkingBuffer, ScratchMemory[Inst->k]))); + break; + case BPF_LEN: + // Just returns the length of seccomp_data. 
+ EMIT_INST(movz(DestReg, sizeof(seccomp_data))); + break; + case BPF_IND: + case BPF_MSH: + default: RETURN_ERROR(-EINVAL); // Unsupported + } + + RETURN_SUCCESS(); +} + +template +uint64_t BPFEmitter::HandleStore(uint32_t BPFIP, const sock_filter* Inst) { + VALIDATE(BPF_SIZE(Inst->code) == BPF_W); + + [[maybe_unused]] size_t OpSize {}; + + const auto SrcReg = BPF_CLASS(Inst->code) == BPF_LD ? REG_A : REG_X; + // Must be smaller than scratch space size. + VALIDATE(Inst->k < 16); + + EMIT_INST(str(SrcReg, REG_SECCOMP_DATA, offsetof(WorkingBuffer, ScratchMemory[Inst->k]))); + + RETURN_SUCCESS(); +} + +template +uint64_t BPFEmitter::HandleALU(uint32_t BPFIP, const sock_filter* Inst) { + [[maybe_unused]] size_t OpSize {}; + const auto SrcType = BPF_SRC(Inst->code); + const auto Op = BPF_OP(Inst->code); + + switch (Op) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_DIV: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_MOD: + case BPF_XOR: { + auto SrcReg = REG_X; + if (SrcType == BPF_K) { + SrcReg = REG_TMP; + auto Const = ConstPool.try_emplace(Inst->k, ARMEmitter::ForwardLabel {}); + EMIT_INST(ldr(SrcReg, &Const.first->second)); + } + + switch (Op) { + case BPF_ADD: EMIT_INST(add(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_SUB: EMIT_INST(sub(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_MUL: EMIT_INST(mul(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_DIV: + // Specifically unsigned. + EMIT_INST(udiv(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); + break; + case BPF_OR: EMIT_INST(orr(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_AND: EMIT_INST(and_(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_LSH: EMIT_INST(lslv(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_RSH: EMIT_INST(lsrv(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + case BPF_MOD: + // Specifically unsigned. 
+ EMIT_INST(udiv(ARMEmitter::Size::i32Bit, REG_TMP2, REG_A, SrcReg)); + EMIT_INST(msub(ARMEmitter::Size::i32Bit, REG_A, REG_TMP2, SrcReg, REG_A)); + break; + case BPF_XOR: EMIT_INST(eor(ARMEmitter::Size::i32Bit, REG_A, REG_A, SrcReg)); break; + default: RETURN_ERROR(-EINVAL); + } + + break; + } + case BPF_NEG: + // Only BPF_K supported on NEG. + VALIDATE(SrcType == BPF_K); + + EMIT_INST(neg(ARMEmitter::Size::i32Bit, REG_A, REG_A)); + break; + + default: RETURN_ERROR(-EINVAL); + } + + RETURN_SUCCESS(); +} + +template +uint64_t BPFEmitter::HandleJmp(uint32_t BPFIP, uint32_t NumInst, const sock_filter* Inst) { + [[maybe_unused]] size_t OpSize {}; + const auto SrcType = BPF_SRC(Inst->code); + const auto Op = BPF_OP(Inst->code); + + switch (Op) { + case BPF_JA: { + // Only BPF_K supported on JA. + VALIDATE(SrcType == BPF_K); + + // BPF IP register is effectively only 32-bit. Treat k constant like a signed integer. + // This allows it to jump anywhere in the program. + // But! Loops are EXPLICITLY disallowed inside of BPF programs. + // This is to prevent DOS style attacks through BPF programs. + uint64_t Target = BPFIP + Inst->k + 1; + // Must not jump past the end. + VALIDATE(Target < NumInst); + + fextl::unordered_map::iterator TargetLabel {}; + + if constexpr (!CalculateSize) { + TargetLabel = JumpLabels.try_emplace(Target, ARMEmitter::ForwardLabel {}).first; + } + + EMIT_INST(b(&TargetLabel->second)); + break; + } + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: { + auto CompareSrcReg = REG_X; + if (SrcType == BPF_K) { + CompareSrcReg = REG_TMP; + auto Const = ConstPool.try_emplace(Inst->k, ARMEmitter::ForwardLabel {}); + EMIT_INST(ldr(CompareSrcReg, &Const.first->second)); + } + uint32_t TargetTrue = BPFIP + Inst->jt + 1; + uint32_t TargetFalse = BPFIP + Inst->jf + 1; + + // Must not jump past the end. 
+ VALIDATE(TargetTrue < NumInst && TargetFalse < NumInst); + + ARMEmitter::Condition CompareResultOp; + if (Op == BPF_JEQ) { + CompareResultOp = ARMEmitter::Condition::CC_EQ; + EMIT_INST(cmp(ARMEmitter::Size::i32Bit, REG_A, CompareSrcReg)); + } else if (Op == BPF_JGT) { + CompareResultOp = ARMEmitter::Condition::CC_HI; + EMIT_INST(cmp(ARMEmitter::Size::i32Bit, REG_A, CompareSrcReg)); + } else if (Op == BPF_JGE) { + CompareResultOp = ARMEmitter::Condition::CC_HS; + EMIT_INST(cmp(ARMEmitter::Size::i32Bit, REG_A, CompareSrcReg)); + } else if (Op == BPF_JSET) { + CompareResultOp = ARMEmitter::Condition::CC_NE; + EMIT_INST(tst(ARMEmitter::Size::i32Bit, REG_A, CompareSrcReg)); + } else { + RETURN_ERROR(-EINVAL); + } + + fextl::unordered_map::iterator TargetTrueLabel {}; + fextl::unordered_map::iterator TargetFalseLabel {}; + + if constexpr (!CalculateSize) { + TargetTrueLabel = JumpLabels.try_emplace(TargetTrue, ARMEmitter::ForwardLabel {}).first; + TargetFalseLabel = JumpLabels.try_emplace(TargetFalse, ARMEmitter::ForwardLabel {}).first; + } + + EMIT_INST(b(CompareResultOp, &TargetTrueLabel->second)); + EMIT_INST(b(&TargetFalseLabel->second)); + break; + } + default: RETURN_ERROR(-EINVAL); // Unknown jump type + } + + RETURN_SUCCESS(); +} + +template +uint64_t BPFEmitter::HandleRet(uint32_t BPFIP, const sock_filter* Inst) { + [[maybe_unused]] size_t OpSize {}; + const auto RValSrc = BPF_RVAL(Inst->code); + switch (RValSrc) { + case BPF_K: { + auto Const = ConstPool.try_emplace(Inst->k, ARMEmitter::ForwardLabel {}); + EMIT_INST(ldr(ARMEmitter::WReg::w0, &Const.first->second)); + break; + } + case BPF_X: EMIT_INST(mov(ARMEmitter::WReg::w0, REG_X)); break; + case BPF_A: + // w0 is already REG_A + static_assert(REG_A == ARMEmitter::WReg::w0, "This is expected to be the same"); + break; + default: RETURN_ERROR(-EINVAL); + } + + EMIT_INST(ret()); + + RETURN_SUCCESS(); +} + +template +uint64_t BPFEmitter::HandleMisc(uint32_t BPFIP, const sock_filter* Inst) { + [[maybe_unused]] 
size_t OpSize {}; + const auto MiscOp = BPF_MISCOP(Inst->code); + switch (MiscOp) { + case BPF_TAX: EMIT_INST(mov(REG_X, REG_A)); break; + case BPF_TXA: EMIT_INST(mov(REG_A, REG_X)); break; + default: RETURN_ERROR(-EINVAL) // Unsupported misc operation. + } + + RETURN_SUCCESS(); +} + +template +uint64_t BPFEmitter::HandleEmission(uint32_t flags, const sock_fprog* prog) { + constexpr Pred PredFunc; + uint64_t CalculatedSize {}; + + for (uint32_t i = 0; i < prog->len; ++i) { + if constexpr (!CalculateSize) { + auto jump_label = JumpLabels.find(i); + if (jump_label != JumpLabels.end()) { + Bind(&jump_label->second); + } + } + + bool HadError {}; + uint64_t Result {}; + + const sock_filter* Inst = &prog->filter[i]; + const uint16_t Code = Inst->code; + const uint16_t Class = BPF_CLASS(Code); + switch (Class) { + case BPF_LD: + case BPF_LDX: { + Result = HandleLoad(i, Inst); + break; + } + case BPF_ST: + case BPF_STX: { + Result = HandleStore(i, Inst); + break; + } + case BPF_ALU: { + Result = HandleALU(i, Inst); + break; + } + case BPF_JMP: { + Result = HandleJmp(i, prog->len, Inst); + break; + } + case BPF_RET: { + Result = HandleRet(i, Inst); + break; + } + case BPF_MISC: { + Result = HandleMisc(i, Inst); + break; + } + default: + // We handle all instruction classes. + FEX_UNREACHABLE; + } + + HadError = PredFunc(Result); + + if constexpr (CalculateSize) { + CalculatedSize += Result; + } + + if (HadError) { + if constexpr (!CalculateSize) { + // Had error, early return and free the memory. + FEXCore::Allocator::munmap(GetBufferBase(), FuncSize); + } + return Result; + } + } + + if constexpr (CalculateSize) { + // Add the constant pool size. + CalculatedSize += ConstPool.size() * 4; + + // Size calculation could have added constants and jump labels. Erase them now. 
+ ConstPool.clear(); + JumpLabels.clear(); + + return CalculatedSize; + } + + return 0; +} + +uint64_t BPFEmitter::JITFilter(uint32_t flags, const sock_fprog* prog) { + FuncSize = HandleEmission(flags, prog); + + if (FuncSize == ~0ULL) { + // Buffer size calculation found invalid code. + return -EINVAL; + } + + SetBuffer((uint8_t*)FEXCore::Allocator::mmap(nullptr, FuncSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), FuncSize); + + const auto CodeBegin = GetCursorAddress(); + + uint64_t Result = HandleEmission(flags, prog); + + if (Result != 0) { + // Had error, early return and free the memory. + FEXCore::Allocator::munmap(GetBufferBase(), FuncSize); + return Result; + } + + const uint64_t CodeOnlySize = GetCursorAddress() - CodeBegin; + + // Emit the constant pool. + Align(); + for (auto& Const : ConstPool) { + Bind(&Const.second); + dc32(Const.first); + } + + ClearICache(CodeBegin, CodeOnlySize); + ::mprotect(CodeBegin, AllocationSize(), PROT_READ | PROT_EXEC); + Func = CodeBegin; + + if constexpr (false) { + // Useful for debugging seccomp filters. + LogMan::Msg::DFmt("JITFilter: disas 0x{:x},+{}", (uint64_t)CodeBegin, CodeOnlySize); + } + + ConstPool.clear(); + JumpLabels.clear(); + return 0; +} + + +} // namespace FEX::HLE diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/BPFEmitter.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/BPFEmitter.h new file mode 100644 index 0000000000..38ae9142e7 --- /dev/null +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/BPFEmitter.h @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +/* +$info$ +tags: LinuxSyscalls|syscalls-shared +$end_info$ +*/ +#pragma once + +#include + +#include + +#include +#include + +struct sock_fprog; +struct sock_filter; + +namespace FEX::HLE { +class BPFEmitter final : public ARMEmitter::Emitter { +public: + struct WorkingBuffer { + struct seccomp_data Data; + uint32_t ScratchMemory[BPF_MEMWORDS]; // Defined as 16 words. 
+ }; + + BPFEmitter() = default; + + uint64_t JITFilter(uint32_t flags, const sock_fprog* prog); + void* GetFunc() const { + return Func; + } + + size_t AllocationSize() const { + return FuncSize; + } + +private: + template + uint64_t HandleLoad(uint32_t BPFIP, const sock_filter* Inst); + template + uint64_t HandleStore(uint32_t BPFIP, const sock_filter* Inst); + template + uint64_t HandleALU(uint32_t BPFIP, const sock_filter* Inst); + template + uint64_t HandleJmp(uint32_t BPFIP, uint32_t NumInst, const sock_filter* Inst); + template + uint64_t HandleRet(uint32_t BPFIP, const sock_filter* Inst); + template + uint64_t HandleMisc(uint32_t BPFIP, const sock_filter* Inst); + +#define EMIT_INST(x) \ + do { \ + if constexpr (CalculateSize) { \ + OpSize += 4; \ + } else { \ + x; \ + } \ + } while (0) + +#define RETURN_ERROR(x) \ + if constexpr (CalculateSize) { \ + return ~0ULL; \ + } else { \ + static_assert(x == -EINVAL, "Early return error evaluation only supports EINVAL"); \ + return x; \ + } + +#define RETURN_SUCCESS() \ + do { \ + if constexpr (CalculateSize) { \ + return OpSize; \ + } else { \ + return 0; \ + } \ + } while (0) + + using SizeErrorCheck = decltype([](uint64_t Result) -> bool { return Result == ~0ULL; }); + using EmissionErrorCheck = decltype([](uint64_t Result) { return Result != 0; }); + + template + uint64_t HandleEmission(uint32_t flags, const sock_fprog* prog); + + // Register selection comes from function signature. 
+ constexpr static auto REG_A = ARMEmitter::WReg::w0; + constexpr static auto REG_X = ARMEmitter::WReg::w1; + constexpr static auto REG_TMP = ARMEmitter::WReg::w2; + constexpr static auto REG_TMP2 = ARMEmitter::WReg::w3; + constexpr static auto REG_SECCOMP_DATA = ARMEmitter::XReg::x4; + fextl::unordered_map JumpLabels; + fextl::unordered_map ConstPool; + + void* Func; + size_t FuncSize; +}; + + +} // namespace FEX::HLE diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/Dumper.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/Dumper.cpp new file mode 100644 index 0000000000..bf2a3764cb --- /dev/null +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/Dumper.cpp @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: MIT +/* +$info$ +tags: LinuxSyscalls|syscalls-shared +$end_info$ +*/ + +#include "LinuxSyscalls/Seccomp/SeccompEmulator.h" + +#include +#include +#include + +namespace FEX::HLE { +void SeccompEmulator::DumpProgram(const sock_fprog* prog) { + auto Parse_Class_LD = [](uint32_t BPFIP, const sock_filter* Inst) { + auto DestName = [](sock_filter const* Inst) { + if (BPF_CLASS(Inst->code) == BPF_LD) { + return "A"; + } else { + return "X"; + } + }; + + auto AccessSize = [](sock_filter const* Inst) { + switch (BPF_SIZE(Inst->code)) { + case BPF_W: return 32; + case BPF_H: return 16; + case BPF_B: return 8; + case 0x18: /* BPF_DW */ return 64; + } + return 0; + }; + + auto ModeType = [](sock_filter const* Inst) { + switch (BPF_MODE(Inst->code)) { + case BPF_IMM: return "IMM"; + case BPF_ABS: return "ABS"; + case BPF_IND: return "IND"; + case BPF_MEM: return "MEM"; + case BPF_LEN: return "LEN"; + case BPF_MSH: return "MSH"; + } + return "Unknown"; + }; + + auto LoadName = [](sock_filter const* Inst) { + using namespace std::string_view_literals; + switch (BPF_MODE(Inst->code)) { + case BPF_IMM: return fextl::fmt::format("#{}", Inst->k); + case BPF_ABS: return fextl::fmt::format("seccomp_data + #{}", Inst->k); + case BPF_IND: return 
fextl::fmt::format("Ind[X+#{}]", Inst->k); + case BPF_MEM: return fextl::fmt::format("Mem[#{}]", Inst->k); + case BPF_LEN: return fextl::fmt::format("len"); + case BPF_MSH: return fextl::fmt::format("msh"); + } + return fextl::fmt::format("Unknown"); + }; + + LogMan::Msg::IFmt("0x{:04x}: {} <- LD.{} {} {}", BPFIP, DestName(Inst), AccessSize(Inst), ModeType(Inst), LoadName(Inst)); + }; + + auto Parse_Class_ST = [](uint32_t BPFIP, const sock_filter* Inst) { + auto DestName = [](sock_filter const* Inst) { + if (BPF_CLASS(Inst->code) == BPF_ST) { + return "A"; + } else { + return "X"; + } + }; + + LogMan::Msg::IFmt("0x{:04x}: Mem[{}] <- ST.{}", BPFIP, Inst->k, DestName(Inst)); + }; + + auto Parse_Class_ALU = [](uint32_t BPFIP, const sock_filter* Inst) { + auto GetOp = [](sock_filter const* Inst) { + const auto Op = BPF_OP(Inst->code); + + switch (Op) { + case BPF_ADD: return "ADD"; + case BPF_SUB: return "SUB"; + case BPF_MUL: return "MUL"; + case BPF_DIV: return "DIV"; + case BPF_OR: return "OR"; + case BPF_AND: return "AND"; + case BPF_LSH: return "LSH"; + case BPF_RSH: return "RSH"; + case BPF_MOD: return "MOD"; + case BPF_XOR: return "XOR"; + case BPF_NEG: return "NEG"; + default: return "Unknown"; + } + }; + + auto GetSrc = [](sock_filter const* Inst) { + switch (BPF_SRC(Inst->code)) { + case BPF_K: return fextl::fmt::format("0x{:x}", Inst->k); + case BPF_X: return fextl::fmt::format(""); + } + return fextl::fmt::format("Unknown"); + }; + + LogMan::Msg::IFmt("0x{:04x}: {} , {}", BPFIP, GetOp(Inst), GetSrc(Inst)); + }; + + auto Parse_Class_JMP = [](uint32_t BPFIP, const sock_filter* Inst) { + auto GetOp = [](sock_filter const* Inst) { + switch (BPF_OP(Inst->code)) { + case BPF_JA: return "a"; + case BPF_JEQ: return "eq"; + case BPF_JGT: return "gt"; + case BPF_JGE: return "ge"; + case BPF_JSET: return "set"; + } + return "Unknown"; + }; + + auto GetSrc = [](sock_filter const* Inst) { + switch (BPF_SRC(Inst->code)) { + case BPF_K: return fextl::fmt::format("0x{:x}", 
Inst->k); + case BPF_X: return fextl::fmt::format(""); + } + return fextl::fmt::format("Unknown"); + }; + + LogMan::Msg::IFmt("0x{:04x}: JMP.{} {}, +{} (#0x{:x}), +{} (#0x{:x})", BPFIP, GetOp(Inst), GetSrc(Inst), Inst->jt, BPFIP + Inst->jt + 1, + Inst->jf, BPFIP + Inst->jf + 1); + }; + + auto Parse_Class_RET = [](uint32_t BPFIP, const sock_filter* Inst) { + auto GetRetValue = [](sock_filter const* Inst) { + switch (BPF_RVAL(Inst->code)) { + case BPF_K: { + uint32_t RetData = Inst->k & SECCOMP_RET_DATA; + switch (Inst->k & SECCOMP_RET_ACTION_FULL) { + case SECCOMP_RET_KILL_PROCESS: return fextl::fmt::format("KILL_PROCESS.{}", RetData); + case SECCOMP_RET_KILL_THREAD: return fextl::fmt::format("KILL_THREAD.{}", RetData); + case SECCOMP_RET_TRAP: return fextl::fmt::format("TRAP.{}", RetData); + case SECCOMP_RET_ERRNO: return fextl::fmt::format("ERRNO.{}", RetData); + case SECCOMP_RET_USER_NOTIF: return fextl::fmt::format("USER_NOTIF.{}", RetData); + case SECCOMP_RET_TRACE: return fextl::fmt::format("TRACE.{}", RetData); + case SECCOMP_RET_LOG: return fextl::fmt::format("LOG.{}", RetData); + case SECCOMP_RET_ALLOW: return fextl::fmt::format("ALLOW.{}", RetData); + default: break; + } + return fextl::fmt::format(".{}", RetData); + } + case BPF_X: return fextl::fmt::format(""); + case BPF_A: return fextl::fmt::format(""); + } + + return fextl::fmt::format("Unknown"); + }; + + LogMan::Msg::IFmt("0x{:04x}: RET {}", BPFIP, GetRetValue(Inst)); + }; + + auto Parse_Class_MISC = [](uint32_t BPFIP, const sock_filter* Inst) { + const auto MiscOp = BPF_MISCOP(Inst->code); + switch (MiscOp) { + case BPF_TAX: LogMan::Msg::IFmt("0x{:04x}: TAX", BPFIP); break; + case BPF_TXA: LogMan::Msg::IFmt("0x{:04x}: TXA", BPFIP); break; + default: LogMan::Msg::IFmt("0x{:04x}: Misc: Unknown", BPFIP); break; + }; + }; + + LogMan::Msg::IFmt("BPF program: 0x{:x} instructions", prog->len); + + for (size_t i = 0; i < prog->len; ++i) { + const sock_filter* Inst = &prog->filter[i]; + const uint16_t Code 
= Inst->code; + const uint16_t Class = BPF_CLASS(Code); + switch (Class) { + case BPF_LD: + case BPF_LDX: Parse_Class_LD(i, Inst); break; + case BPF_ST: + case BPF_STX: Parse_Class_ST(i, Inst); break; + case BPF_ALU: Parse_Class_ALU(i, Inst); break; + case BPF_JMP: Parse_Class_JMP(i, Inst); break; + case BPF_RET: Parse_Class_RET(i, Inst); break; + case BPF_MISC: Parse_Class_MISC(i, Inst); break; + } + } +} +} // namespace FEX::HLE diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/SeccompEmulator.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/SeccompEmulator.cpp new file mode 100644 index 0000000000..f627fe29fb --- /dev/null +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/SeccompEmulator.cpp @@ -0,0 +1,718 @@ +// SPDX-License-Identifier: MIT +/* +$info$ +tags: LinuxSyscalls|syscalls-shared +$end_info$ +*/ + +#include "LinuxSyscalls/Seccomp/BPFEmitter.h" +#include "LinuxSyscalls/Seccomp/SeccompEmulator.h" + +#include "LinuxSyscalls/x32/Syscalls.h" +#include "LinuxSyscalls/x64/Syscalls.h" +#include "LinuxSyscalls/SignalDelegator.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// seccomp +// +// global +// - kcmp - pass +// - mode_strict_support - pass +// - mode_strict_cannot_call_prctl - pass +// - no_new_privs_support - pass +// - mode_filter_support - pass +// - mode_filter_without_nnp - pass +// - filter_size_limits - pass +// - filter_chain_limits - pass +// - mode_filter_cannot_move_to_strict - pass +// - mode_filter_get_seccomp - pass +// - ALLOW_all - pass +// - empty_prog - pass +// - log_all - pass +// - unknown_ret_is_kill_inside - pass +// - unknown_ret_is_kill_above_allow - pass +// - KILL_all - pass +// - KILL_one - pass +// - KILL_one_arg_one - pass +// - KILL_one_arg_six - pass +// - KILL_thread - FAIL (unrelated to bpf) +// - KILL_process - FAIL (unrelated to bpf) +// - KILL_unknown - FAIL (unrelated to bpf) +// - arg_out_of_range - pass +// - 
ERRNO_valid - pass +// - ERRNO_zero - pass +// - ERRNO_capped - pass +// - ERRNO_order - pass +// - seccomp_syscall - pass +// - seccomp_syscall_mode_lock - pass +// - detect_seccomp_filter_flags - pass +// - TSYNC_first - pass +// - syscall_restart - FAIL (PTRACE) +// - filter_flag_log - pass +// - get_action_avail - FAIL (ptrace and user-notif) +// TSYNC +// - siblings_fail_prctl - pass +// - two_siblings_with_ancestor - FAIL (kill-thread not working quite right) +// - two_sibling_want_nnp - pass +// - two_siblings_with_one_divergence - pass +// - two_siblings_with_one_divergence_no_tid_in_err - pass +// - two_siblings_not_under_filter - FAIL (kill-thread not working quite right) +// - two_siblings_with_no_filter - FAIL (kill-thread not working quite right) +// +// user-notif stuff +// - get_metadata - SKIP (Needs root) +// - user_notification_basic - FAIL (user-notif) +// - user_notification_with_tsync - FAIL (user-notif) +// - user_notification_kill_in_middle - FAIL (user-notif) +// - user_notification_signal - FAIL (user-notif) +// - user_notification_closed_listener - FAIL (user-notif) +// - user_notification_child_pid_ns - FAIL (user-notif) +// - user_notification_sibling_pid_ns - FAIL (user-notif) +// - user_notification_fault_recv - FAIL (user-notif) +// - seccomp_get_notif_sizes - pass +// - user_notification_continue - FAIL (user-notif) +// - user_notification_filter_empty - FAIL (user-notif) +// - user_notification_filter_empty_threaded - FAIL (user-notif) +// - user_notification_addfd - FAIL (user-notif) +// - user_notification_addfd_rlimit - FAIL (user-notif) +// - user_notification_sync - FAIL (user-notif) +// - user_notification_fifo - FAIL (user-notif) +// - user_notification_wait_killable_pre_notification - FAIL (user-notif) +// - user_notification_wait_killable - FAIL (user-notif) +// - user_notification_wait_killable_fatal - FAIL (user-notif) +// +// O_SUSPEND_SECCOMP +// - setoptions - FAIL (ptrace) +// - seize - FAIL (ptrace) +// TRAP +// - 
dfl - pass +// - ign - pass +// - handler - pass +// +// precedence +// - allow_ok - pass +// - kill_is_highest - pass +// - kill_is_highest_in_any_order - pass +// - trap_is_second - pass +// - trap_is_second_in_any_order - pass +// - errno_is_third - pass +// - errno_is_third_in_any_order - pass +// - trace_is_fourth - pass +// - trace_is_fourth_in_any_order - pass +// - log_is_fifth - pass +// - log_is_fifth_in_any_order - pass +// +// TRACE_poke +// - ptrace unsupported +// TRACE_syscall +// - ptrace unsupported + +namespace FEX::HLE { +uint64_t SeccompEmulator::Handle(FEXCore::Core::CpuStateFrame* Frame, uint32_t Op, uint32_t flags, void* arg) { + // If seccomp isn't enabled then say so. + if (!NeedsSeccomp) { + return -EINVAL; + } + + switch (Op) { + case SECCOMP_SET_MODE_STRICT: return SetModeStrict(Frame, flags, arg); + case SECCOMP_SET_MODE_FILTER: return SetModeFilter(Frame, flags, static_cast(arg)); + case SECCOMP_GET_ACTION_AVAIL: return GetActionAvail(flags, static_cast(arg)); + case SECCOMP_GET_NOTIF_SIZES: return GetNotifSizes(flags, static_cast(arg)); + default: + // operation is unknown or is not supported by this kernel version or configuration. + return -EINVAL; + } +} + +// Equivalent to prctl(PR_GET_SECCOMP) +uint64_t SeccompEmulator::GetSeccomp(FEXCore::Core::CpuStateFrame* Frame) { + // If seccomp isn't enabled then say so. + if (!NeedsSeccomp) { + return -EINVAL; + } + + auto Thread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame); + return Thread->SeccompMode; +} + +void SeccompEmulator::InheritSeccompFilters(FEX::HLE::ThreadStateObject* Parent, FEX::HLE::ThreadStateObject* Child) { + // Don't interrupt me while I'm copying. 
+ auto lk = FEXCore::MaskSignalsAndLockMutex(FilterMutex); + + Child->Filters.resize(Parent->Filters.size()); + + for (size_t i = 0; i < Child->Filters.size(); ++i) { + auto& ParentFilter = Parent->Filters[i]; + auto& ChildFilter = Child->Filters[i]; + ChildFilter = ParentFilter; + std::atomic_ref(ParentFilter->RefCount)++; + } + + // Copy the operating mode. + Child->SeccompMode = Parent->SeccompMode; +} + +void SeccompEmulator::FreeSeccompFilters(FEX::HLE::ThreadStateObject* Thread) { + // Don't talk to me when I'm busy deleting myself. + auto lk = FEXCore::MaskSignalsAndLockMutex(FilterMutex); + + bool HasFiltersToDelete {}; + for (auto& Filter : Thread->Filters) { + auto RefCount = std::atomic_ref(Filter->RefCount).fetch_sub(1); + + if (RefCount == 1) { + HasFiltersToDelete = true; + } + } + Thread->Filters.clear(); + + if (HasFiltersToDelete) { + // Garbage collect filters + std::erase_if(Filters, [](auto& Filter) { + if (std::atomic_ref(Filter.RefCount).load(std::memory_order_relaxed) != 0) { + return false; + } + + FEXCore::Allocator::munmap(reinterpret_cast(Filter.Func), Filter.MappedSize); + return true; + }); + } +} + +struct SerializedFilter { + size_t CodeSize; + uint32_t FilterInstructions; + bool ShouldLog; + char Code[]; +}; + +struct SerializationHeader { + size_t NumberOfFilters; + uint32_t SeccompMode; + SerializedFilter Filters[]; +}; + +std::optional SeccompEmulator::SerializeFilters(FEXCore::Core::CpuStateFrame* Frame) { + auto Thread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame); + if (Thread->SeccompMode == SECCOMP_MODE_DISABLED) { + // Didn't have seccomp enabled. 
+ return std::nullopt; + } + + int FD = memfd_create("seccomp_filters", MFD_ALLOW_SEALING); + if (FD == -1) { + // Couldn't create memfd + LogMan::Msg::EFmt("Couldn't create seccomp filter FD!"); + return -1; + } + + SerializationHeader Header { + .NumberOfFilters = Thread->Filters.size(), + .SeccompMode = Thread->SeccompMode, + }; + + int Res = write(FD, &Header, sizeof(Header)); + if (Res == -1) { + LogMan::Msg::EFmt("Couldn't write header!"); + close(FD); + return -1; + } + + for (auto& Filter : Thread->Filters) { + SerializedFilter SFilter { + .CodeSize = Filter->MappedSize, + .FilterInstructions = Filter->FilterInstructions, + .ShouldLog = Filter->ShouldLog, + }; + + Res = write(FD, &SFilter, sizeof(SFilter)); + if (Res == -1) { + LogMan::Msg::EFmt("Couldn't write filter header!"); + close(FD); + return -1; + } + + Res = write(FD, (const void*)Filter->Func, Filter->MappedSize); + if (Res == -1) { + LogMan::Msg::EFmt("Couldn't write filter!"); + close(FD); + return -1; + } + } + + // Reset FD to start. + lseek(FD, 0, SEEK_SET); + + // Seal everything about this FD. 
+  fcntl(FD, F_ADD_SEALS, F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+
+  return FD;
+}
+
+// Rebuilds the calling thread's seccomp filters from FD.
+// Reads a SerializationHeader, then for each filter a SerializedFilter header followed by CodeSize bytes
+// of code. Each filter's code is read into a fresh mapping that is then made read+exec, registered in
+// Filters with a reference count of 1 and attached to the thread.
+// FD is closed on every path. On failure the error is logged and the function returns early: filters
+// loaded before the failure stay attached and the thread's SeccompMode is left unchanged.
+void SeccompEmulator::DeserializeFilters(FEXCore::Core::CpuStateFrame* Frame, int FD) {
+  auto Thread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame);
+
+  SerializationHeader Header;
+  int Res = read(FD, &Header, sizeof(Header));
+  if (Res == -1 || Res != sizeof(Header)) {
+    LogMan::Msg::EFmt("Couldn't read Seccomp header!");
+    close(FD);
+    return;
+  }
+
+  for (size_t i = 0; i < Header.NumberOfFilters; ++i) {
+    SerializedFilter SFilter;
+
+    Res = read(FD, &SFilter, sizeof(SFilter));
+    if (Res == -1 || Res != sizeof(SFilter)) {
+      LogMan::Msg::EFmt("Couldn't read Seccomp Filter header!");
+      close(FD);
+      return;
+    }
+    auto Ptr = FEXCore::Allocator::mmap(nullptr, SFilter.CodeSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (Ptr == (void*)~0ULL) {
+      LogMan::Msg::EFmt("Couldn't allocate ptr for filter!");
+      close(FD);
+      return;
+    }
+
+    Res = read(FD, Ptr, SFilter.CodeSize);
+    if (Res == -1 || Res != SFilter.CodeSize) {
+      LogMan::Msg::EFmt("Couldn't read Seccomp Filter code!");
+      // Nothing owns this mapping yet, so release it here or it leaks.
+      FEXCore::Allocator::munmap(Ptr, SFilter.CodeSize);
+      close(FD);
+      return;
+    }
+
+    ::mprotect(Ptr, SFilter.CodeSize, PROT_READ | PROT_EXEC);
+
+    auto& it = Filters.emplace_back(FilterInformation {(FilterFunc)Ptr, 1, SFilter.CodeSize, SFilter.FilterInstructions, SFilter.ShouldLog});
+    TotalFilterInstructions += SFilter.FilterInstructions;
+
+    // Append the filter to the thread.
+    Thread->Filters.emplace_back(&it);
+  }
+
+  Thread->SeccompMode = Header.SeccompMode;
+  close(FD);
+}
+
+SeccompEmulator::ExecuteFilterResult
+SeccompEmulator::ExecuteFilter(FEXCore::Core::CpuStateFrame* Frame, uint64_t JITPC, FEXCore::HLE::SyscallArguments* Args) {
+  auto Thread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame);
+
+  if (Thread->Filters.empty()) {
+    // Seccomp not installed. Allow it.
+    return {false, 0};
+  }
+
+  // Reconstruct the RIP from the JITPC.
+ const uint64_t RIP = Thread->Thread->CTX->RestoreRIPFromHostPC(Frame->Thread, JITPC); + + const auto Arch = Is64BitMode() ? AUDIT_ARCH_X86_64 : AUDIT_ARCH_I386; + bool ShouldLog {}; + uint32_t SeccompResult {}; + + { + BPFEmitter::WorkingBuffer Data { + .Data = + { + .nr = static_cast(Args->Argument[0]), + .arch = Arch, + .instruction_pointer = RIP, + .args = + { + Args->Argument[1], + Args->Argument[2], + Args->Argument[3], + Args->Argument[4], + Args->Argument[5], + Args->Argument[6], + }, + }, + }; + + bool HasResult {}; + // seccomp filters are executed from latest added to oldest. + for (auto it = Thread->Filters.rbegin(); it != Thread->Filters.rend(); ++it) { + // Explicitly zero scratch memory. + memset(&Data.ScratchMemory, 0, sizeof(Data.ScratchMemory)); + + uint32_t CurrentResult = (*it)->Func(0, 0, 0, 0, &Data); + + if (!HasResult) { + SeccompResult = CurrentResult; + ShouldLog = (*it)->ShouldLog; + HasResult = true; + continue; + } + + const int16_t CurrentAction = (CurrentResult & SECCOMP_RET_ACTION_FULL) >> 16; + const int16_t Action = (SeccompResult & SECCOMP_RET_ACTION_FULL) >> 16; + + // All actions are executed but the first highest precendent result is returned. 
+ // Precedence order from highest priority to lowest: + // - SECCOMP_RET_KILL_PROCESS (0x8000, -32768) + // - SECCOMP_RET_KILL_THREAD (0x0000, 0) + // - SECCOMP_RET_TRAP (0x0003, 3) + // - SECCOMP_RET_ERRNO (0x0005, 5) + // - SECCOMP_RET_USER_NOTIF (0x7fc0, 32704) + // - SECCOMP_RET_TRACE (0x7ff0, 32752) + // - SECCOMP_RET_LOG (0x7ffc, 32764) + // - SECCOMP_RET_ALLOW (0x7fff, 32767) + if (CurrentAction < Action) { + SeccompResult = CurrentResult; + ShouldLog = (*it)->ShouldLog; + } + } + } + + const auto ActionMasked = SeccompResult & SECCOMP_RET_ACTION_FULL; + const auto DataMasked = SeccompResult & SECCOMP_RET_DATA; + + // Logging rules + // - Log if explicitly returning SECCOMP_RET_LOG + // - Log if the filter enabled the logging flag and the action is something other than SECCOMP_RET_ALLOW. + if ((ShouldLog && ActionMasked != SECCOMP_RET_ALLOW) || ActionMasked == SECCOMP_RET_LOG) { + int Signal = 0; + switch (ActionMasked) { + case SECCOMP_RET_KILL_PROCESS: + case SECCOMP_RET_KILL_THREAD: Signal = GetKillSignal(); break; + case SECCOMP_RET_TRAP: Signal = SIGSYS; break; + default: break; + } + + // With real seccomp the logs go to dmesg. Log through FEX since we can't use dmesg. + // ex: `[13572.669277] audit: type=1326 audit(1715469332.533:62): auid=1000 uid=1000 gid=1000 ses=2 subj=unconfined pid=52546 comm="seccomp_bpf" + // exe="/mnt/Work/Projects/work/linux/tools/testing/selftests/seccomp/seccomp_bpf" sig=0 arch=c000003e syscall=39 compat=0 ip=0x7d789352725d code=0x7ffc0000` + timespec tp {}; + clock_gettime(CLOCK_MONOTONIC, &tp); + LogMan::Msg::IFmt("audit: type={} audit({}.{:03}:{}): uid={} gid={} pid={} comm={} sig={} arch={:x} syscall={} ip=0x{:x} code=0x{:x}", + AUDIT_SECCOMP, tp.tv_sec, tp.tv_nsec / 1'000'000, AuditSerialIncrement(), ::getuid(), ::getgid(), ::getpid(), + Filename(), Signal, Arch, Args->Argument[0], RIP, SeccompResult); + } + + switch (ActionMasked) { + // Unknown actions behave like RET_KILL_PROCESS.
+ default: + case SECCOMP_RET_KILL_PROCESS: { + const int KillSignal = GetKillSignal(); + // Ignores signal handler and sigmask + uint64_t Mask = 1 << (KillSignal - 1); + SignalDelegation->GuestSigProcMask(Thread, SIG_UNBLOCK, &Mask, nullptr); + SignalDelegation->UninstallHostHandler(KillSignal); + kill(0, KillSignal); + break; + } + case SECCOMP_RET_KILL_THREAD: { + // Ignores signal handler and sigmask + uint64_t Mask = 1 << (SIGSYS - 1); + SignalDelegation->GuestSigProcMask(Thread, SIG_UNBLOCK, &Mask, nullptr); + SignalDelegation->UninstallHostHandler(SIGSYS); + tgkill(::getpid(), ::gettid(), SIGSYS); + break; + } + case SECCOMP_RET_TRAP: { + siginfo_t Info { + .si_signo = SIGSYS, + .si_errno = static_cast(DataMasked), + .si_code = 1, // SYS_SECCOMP + }; + + Info.si_call_addr = reinterpret_cast(RIP); + Info.si_syscall = Args->Argument[0]; + Info.si_arch = Arch; + + SignalDelegation->QueueSignal(::getpid(), ::gettid(), SIGSYS, &Info, true); + break; + } + case SECCOMP_RET_ERRNO: { + // errno return is clamped. + return {true, -(std::min(DataMasked, 4095))}; + } + case SECCOMP_RET_TRACE: { + // When no tracer attached, behave like RET_ERRNO returning ENOSYS. + // TODO: Implement once FEX supports tracing. + return {true, static_cast(-ENOSYS)}; + } + case SECCOMP_RET_USER_NOTIF: + case SECCOMP_RET_LOG: + case SECCOMP_RET_ALLOW: break; + } + + return {false, 0}; +} + +// Equivalent to seccomp(SECCOMP_SET_MODE_STRICT, ...); +uint64_t SeccompEmulator::SetModeStrict(FEXCore::Core::CpuStateFrame* Frame, uint32_t flags, const void* arg) { + const auto Thread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame); + + if (::prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 0) { + // The caller did not have the CAP_SYS_ADMIN capability in its user namespace, or had not set no_new_privs before using SECCOMP_SET_MODE_FILTER. + return -EACCES; + } + + if (flags != 0) { + // The specified flags are invalid for the given operation. 
+ return -EINVAL; + } + + if (arg != nullptr) { + // The specified arg is invalid for the given operation. + return -EINVAL; + } + + if (Thread->SeccompMode == SECCOMP_MODE_FILTER) { + // Filter mode cannot move to strict + return -EINVAL; + } + +#define syscall_nr (offsetof(struct seccomp_data, nr)) +#define ALLOW_SYSCALL(name) \ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, FEX::HLE::x64::SYSCALL_x64_##name, 0, 1), BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW) +#define ALLOW_SYSCALL_x32(name) \ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, FEX::HLE::x32::SYSCALL_x86_##name, 0, 1), BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW) + + constexpr static struct sock_filter strict_filter_x64[] = { + // Load syscall number + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, syscall_nr), + + // Allow read, write, exit, exit_group, and sigreturn + ALLOW_SYSCALL(read), + ALLOW_SYSCALL(write), + ALLOW_SYSCALL(exit), + ALLOW_SYSCALL(exit_group), + ALLOW_SYSCALL(rt_sigreturn), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS), + }; + + constexpr static struct sock_filter strict_filter_x32[] = { + // Load syscall number + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, syscall_nr), + + // Allow read, write, exit, exit_group, and sigreturn + ALLOW_SYSCALL_x32(read), + ALLOW_SYSCALL_x32(write), + ALLOW_SYSCALL_x32(exit), + ALLOW_SYSCALL_x32(exit_group), + ALLOW_SYSCALL_x32(rt_sigreturn), + ALLOW_SYSCALL_x32(sigreturn), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS), + }; + + const sock_fprog prog_x64 { + .len = (unsigned short)(sizeof(strict_filter_x64) / sizeof(strict_filter_x64[0])), + .filter = const_cast(strict_filter_x64), + }; + + const sock_fprog prog_x32 { + .len = (unsigned short)(sizeof(strict_filter_x32) / sizeof(strict_filter_x32[0])), + .filter = const_cast(strict_filter_x32), + }; + CurrentKillSignal = SIGKILL; + const sock_fprog* prog = Is64BitMode() ?
&prog_x64 : &prog_x32; + SetModeFilter(Frame, 0, prog); + Thread->SeccompMode = SECCOMP_MODE_STRICT; + + return 0; +} + +uint64_t SeccompEmulator::CanDoTSync(FEXCore::Core::CpuStateFrame* Frame) { + auto ParentThread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame); + auto Threads = SyscallHandler->TM.GetThreads(); + + for (auto& Thread : *Threads) { + if (Thread == ParentThread) { + // Skip same thread. + continue; + } + + if (Thread->SeccompMode == SECCOMP_MODE_DISABLED) { + // Threads which have seccomp disabled are safe to TSync + continue; + } + + if (Thread->SeccompMode != ParentThread->SeccompMode) { + /// If the seccomp mode differs between threads then it can't tsync. + /// Strict versus filter mode aren't tsync compatible. + return Thread->ThreadInfo.TID; + } + + if (Thread->Filters.size() != ParentThread->Filters.size()) { + // If the filter count doesn't even match then it can't tsync. + return Thread->ThreadInfo.TID; + } + + // Walk each filter and ensure the entry points are the same and in the same order. + for (size_t i = 0; i < ParentThread->Filters.size(); ++i) { + if (Thread->Filters[i]->Func != ParentThread->Filters[i]->Func) { + /// Entry point mismatch, not the same filter. + /// Not tsync compatible. + return Thread->ThreadInfo.TID; + } + } + } + + // Everything matched. tsync compatible! + return 0; +} + +void SeccompEmulator::TSyncFilters(FEXCore::Core::CpuStateFrame* Frame) { + auto ParentThread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame); + auto Threads = SyscallHandler->TM.GetThreads(); + + for (auto& Thread : *Threads) { + if (Thread == ParentThread) { + // Skip same thread. 
+ continue; + } + + Thread->Filters.clear(); + Thread->Filters = ParentThread->Filters; + for (auto& Filter : ParentThread->Filters) { + // Need to increment all the refcounters + std::atomic_ref(Filter->RefCount)++; + } + Thread->SeccompMode = ParentThread->SeccompMode; + } +} + +// Equivalent to seccomp(SECCOMP_SET_MODE_FILTER, ...); +uint64_t SeccompEmulator::SetModeFilter(FEXCore::Core::CpuStateFrame* Frame, uint32_t flags, const sock_fprog* prog) { + auto Thread = FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame); + + // Order of checks in this function matter + // 1) Check flags + // 2) Check if program is invalid + uint32_t SUPPORTED_FLAGS = SECCOMP_FILTER_FLAG_TSYNC | // 1U << 0 + SECCOMP_FILTER_FLAG_LOG | // 1U << 1 + SECCOMP_FILTER_FLAG_SPEC_ALLOW | // 1U << 2 + // SECCOMP_FILTER_FLAG_NEW_LISTENER | // 1U << 3 + SECCOMP_FILTER_FLAG_TSYNC_ESRCH | // 1U << 4 + 0; + + const bool DoingTsync = flags & SECCOMP_FILTER_FLAG_TSYNC; + + if (flags & ~SUPPORTED_FLAGS) { + // Unknown flags passed in. + return -EINVAL; + } + + if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) && !(flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)) { + /// If NEW_LISTENER and TSYNC are both used then TSYNC_ESRCH must also be set. + /// Otherwise on error there would be no way to tell the difference between success and failure. + return -EINVAL; + } + + if (!prog) { + return -EFAULT; + } + + if (::prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 0) { + // The caller did not have the CAP_SYS_ADMIN capability in its user namespace, or had not set no_new_privs before using SECCOMP_SET_MODE_FILTER. + return -EACCES; + } + + if (prog->len > BPF_MAXINSNS || prog->len == 0) { + // operation specified SECCOMP_SET_MODE_FILTER, but the filter program pointed to by args was not valid or the length of the filter + // program was zero or exceeded BPF_MAXINSNS (4096) instructions. + return -EINVAL; + } + + // Don't interrupt me while I'm jitting. 
+ auto lk = FEXCore::MaskSignalsAndLockMutex(FilterMutex); + + const size_t TotalFinalInstructions = TotalFilterInstructions + prog->len + Thread->Filters.size() * BPF_MULTIFILTERPENALTY; + if (TotalFinalInstructions > BPF_MAX_INSNS_PER_PATH) { + return -ENOMEM; + } + + if constexpr (false) { + // Useful for debugging seccomp problems. + DumpProgram(prog); + } + + if (DoingTsync) { + auto TSyncThread = CanDoTSync(Frame); + if (TSyncThread != 0) { + if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) { + // This flag explicitly ensures that if TSYNC can't sync then it won't return a TID. + return -ESRCH; + } else { + // Return the TID that caused a tsync problem. + return TSyncThread; + } + } + } + + BPFEmitter emit {}; + const bool LoggingEnabled = flags & SECCOMP_FILTER_FLAG_LOG; + auto Result = emit.JITFilter(flags, prog); + if (Result == 0) { + + auto& it = Filters.emplace_back(FilterInformation {(FilterFunc)emit.GetFunc(), 1, emit.AllocationSize(), prog->len, LoggingEnabled}); + TotalFilterInstructions += prog->len; + + // Append the filter to the thread. 
+ Thread->Filters.emplace_back(&it); + Thread->SeccompMode = SECCOMP_MODE_FILTER; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + TSyncFilters(Frame); + } + } + + return Result; +} + +// Equivalent to seccomp(SECCOMP_GET_ACTION_AVAIL, ...); +uint64_t SeccompEmulator::GetActionAvail(uint32_t flags, const uint32_t* action) { + if (flags != 0) { + // Unknown flags passed in + return -EINVAL; + } + + if (!action) { + // Invalid action + return -EFAULT; + } + switch (*action) { + case SECCOMP_RET_KILL_PROCESS: + case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_LOG: + case SECCOMP_RET_ALLOW: return 0; + case SECCOMP_RET_USER_NOTIF: + case SECCOMP_RET_TRACE: + default: break; + } + + return -EOPNOTSUPP; +} + +// Equivalent to seccomp(SECCOMP_GET_NOTIF_SIZES, ...); +uint64_t SeccompEmulator::GetNotifSizes(uint32_t flags, struct seccomp_notif_sizes* sizes) { + if (flags != 0) { + // Unknown flags passed in + return -EINVAL; + } + sizes->seccomp_notif = sizeof(struct seccomp_notif); + sizes->seccomp_notif_resp = sizeof(struct seccomp_notif_resp); + sizes->seccomp_data = sizeof(struct seccomp_data); + + return 0; +} + +} // namespace FEX::HLE diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/SeccompEmulator.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/SeccompEmulator.h new file mode 100644 index 0000000000..e3cf4caff4 --- /dev/null +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Seccomp/SeccompEmulator.h @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: MIT +/* +$info$ +tags: LinuxSyscalls|syscalls-shared +$end_info$ +*/ +#pragma once + +#include +#include +#include + +#include + +struct sock_fprog; +struct seccomp_data; +struct seccomp_notif_sizes; + +namespace FEXCore { + +namespace Core { + struct CpuStateFrame; +} + +namespace HLE { + struct SyscallArguments; +} + +} // namespace FEXCore + +namespace FEX::HLE { + +class SyscallHandler; +class SignalDelegator; +struct ThreadStateObject; + +class 
SeccompEmulator final { +public: + SeccompEmulator(FEX::HLE::SyscallHandler* SyscallHandler, FEX::HLE::SignalDelegator* SignalDelegation) + : SyscallHandler {SyscallHandler} + , SignalDelegation {SignalDelegation} {} + + uint64_t Handle(FEXCore::Core::CpuStateFrame* Frame, uint32_t Op, uint32_t flags, void* arg); + + // Equivalent to prctl(PR_GET_SECCOMP) + uint64_t GetSeccomp(FEXCore::Core::CpuStateFrame* Frame); + + void InheritSeccompFilters(FEX::HLE::ThreadStateObject* Parent, FEX::HLE::ThreadStateObject* Child); + void FreeSeccompFilters(FEX::HLE::ThreadStateObject* Thread); + + struct ExecuteFilterResult { + bool EarlyReturn {}; + uint64_t Result; + }; + ExecuteFilterResult ExecuteFilter(FEXCore::Core::CpuStateFrame* Frame, uint64_t JITPC, FEXCore::HLE::SyscallArguments* Args); + int GetKillSignal() const { + return CurrentKillSignal; + } + + std::optional SerializeFilters(FEXCore::Core::CpuStateFrame* Frame); + void DeserializeFilters(FEXCore::Core::CpuStateFrame* Frame, int FD); + + using FilterFunc = uint64_t (*)(uint32_t Acc, uint32_t Index, uint32_t Tmp, uint32_t Tmp2, void* Data); + struct FilterInformation final { + FilterFunc Func; + uint64_t RefCount; + size_t MappedSize; + uint32_t FilterInstructions; + bool ShouldLog; + }; + +private: + FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE); + FEX_CONFIG_OPT(NeedsSeccomp, NEEDSSECCOMP); + FEX_CONFIG_OPT(Filename, APP_FILENAME); + FEX::HLE::SyscallHandler* SyscallHandler; + FEX::HLE::SignalDelegator* SignalDelegation; + + int CurrentKillSignal {SIGSYS}; + + // Equivalent to seccomp(SECCOMP_SET_MODE_STRICT, ...); + uint64_t SetModeStrict(FEXCore::Core::CpuStateFrame* Frame, uint32_t flags, const void* arg); + // Equivalent to seccomp(SECCOMP_SET_MODE_FILTER, ...); + uint64_t SetModeFilter(FEXCore::Core::CpuStateFrame* Frame, uint32_t flags, const sock_fprog* prog); + // Equivalent to seccomp(SECCOMP_GET_ACTION_AVAIL, ...); + uint64_t GetActionAvail(uint32_t flags, const uint32_t* action); + // Equivalent to 
seccomp(SECCOMP_GET_NOTIF_SIZES, ...); + uint64_t GetNotifSizes(uint32_t flags, struct seccomp_notif_sizes* sizes); + + // 0 on TSync possible + /// TID for the first thread that breaks tsync. + uint64_t CanDoTSync(FEXCore::Core::CpuStateFrame* Frame); + void TSyncFilters(FEXCore::Core::CpuStateFrame* Frame); + + static void DumpProgram(const sock_fprog* prog); + + // Multiple filter instruction count penalty. + // When multiple filters are installed there is a penalty per filter counted towards the maximum number of instructions. + constexpr static size_t BPF_MULTIFILTERPENALTY = 4; + // Maximum number of BPF instructions. + constexpr static size_t BPF_MAX_INSNS_PER_PATH = 32768; + uint64_t TotalFilterInstructions {}; + + FEXCore::ForkableUniqueMutex FilterMutex; + fextl::list Filters {}; + + uint64_t AuditSerialIncrement() { + return AuditSerial.fetch_add(1); + } + std::atomic AuditSerial {}; +}; +} // namespace FEX::HLE diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp index c99d9de554..fe048e7024 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp @@ -799,6 +799,40 @@ void SignalDelegator::UninstallHostHandler(int Signal) { ::syscall(SYS_rt_sigaction, Signal, &SignalHandler.OldAction, nullptr, 8); } +void SignalDelegator::QueueSignal(pid_t tgid, pid_t tid, int Signal, siginfo_t* info, bool IgnoreMask) { + bool WasIgnored {}; + bool WasMasked {}; + SignalHandler& SignalHandler = HostHandlers[Signal]; + if (SignalHandler.GuestAction.sigaction_handler.handler == SIG_IGN && IgnoreMask) { + ::syscall(SYS_rt_sigaction, Signal, &SignalHandler.OldAction, nullptr, 8); + WasIgnored = true; + } + + // Get the current host signal mask + uint64_t ThreadSignalMask {}; + const uint64_t SignalMask = 1ULL << (Signal - 1); + ::syscall(SYS_rt_sigprocmask, 0, nullptr, &ThreadSignalMask, 8); + if 
(ThreadSignalMask & SignalMask) { + WasMasked = true; + + // Signal currently masked, unmask + ThreadSignalMask &= ~SignalMask; + ::syscall(SYS_rt_sigprocmask, 0, &ThreadSignalMask, &ThreadSignalMask, 8); + } + + ::syscall(SYSCALL_DEF(rt_tgsigqueueinfo), tgid, tid, Signal, info); + + if (WasMasked) { + // Mask again + ::syscall(SYS_rt_sigprocmask, 0, &ThreadSignalMask, nullptr, 8); + } + + if (WasIgnored) { + // Ignore again + ::syscall(SYS_rt_sigaction, Signal, &SignalHandler.HostAction, nullptr, 8); + } +} + SignalDelegator::SignalDelegator(FEXCore::Context::Context* _CTX, const std::string_view ApplicationName) : CTX {_CTX} , ApplicationName {ApplicationName} { diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.h index c670b7f008..53a046cf20 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.h @@ -100,6 +100,8 @@ class SignalDelegator final : public FEXCore::SignalDelegator, public FEXCore::A void CheckXIDHandler(); void UninstallHostHandler(int Signal); + void QueueSignal(pid_t tgid, pid_t tid, int Signal, siginfo_t* info, bool IgnoreMask); + FEXCore::Context::Context* CTX; void SetVDSOSigReturn() { diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp index 99c6d5f874..68ef77714c 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp @@ -43,6 +43,8 @@ desc: Glue logic, brk allocations #include #include #include +#include +#include #include #include #include @@ -51,6 +53,7 @@ desc: Glue logic, brk allocations #include #include #include +#include #include #include #include @@ -193,7 +196,7 @@ static bool IsShebangFilename(const fextl::string& Filename) { return IsShebang; } -uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* 
envp, ExecveAtArgs Args) { +uint64_t ExecveHandler(FEXCore::Core::CpuStateFrame* Frame, const char* pathname, char* const* argv, char* const* envp, ExecveAtArgs Args) { auto SyscallHandler = FEX::HLE::_SyscallHandler; fextl::string Filename {}; @@ -204,6 +207,7 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env const bool IsFDExec = (Args.flags & AT_EMPTY_PATH) && strlen(pathname) == 0; const bool SupportsProcFSInterpreter = SyscallHandler->FM.SupportsProcFSInterpreterPath(); fextl::string FDExecEnv; + fextl::string FDSeccompEnv; bool IsShebang {}; @@ -260,6 +264,21 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env char* const* EnvpPtr = envp; bool FDExecCopy {}; + auto SeccompFD = SyscallHandler->SeccompEmulator.SerializeFilters(Frame); + const auto HasSeccomp = SeccompFD.has_value() && *SeccompFD != -1; + + auto CloseSeccompFD = [&HasSeccomp, &SeccompFD]() { + if (HasSeccomp) { + close(*SeccompFD); + } + }; + + auto CloseFDExecFD = [&FDExecCopy, &Args]() { + if (FDExecCopy) { + close(Args.dirfd); + } + }; + // If we don't have the interpreter installed we need to be extra careful for ENOEXEC // Reasoning is that if we try executing a file from FEXLoader then this process loses the ENOEXEC flag // Kernel does its own checks for file format support for this @@ -282,7 +301,7 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env // TODO: Additional future tasks that require envp copying in the future: // - seccomp inheritance // - FEXServer FD inheritance (unshare(CLONE_NEWNET)) - const bool NeedsEnvpCopy = IsFDExec && !(IsBinfmtCompatible || IsOtherELF); + const bool NeedsEnvpCopy = (IsFDExec && !(IsBinfmtCompatible || IsOtherELF)) || HasSeccomp; if (NeedsEnvpCopy) { if (envp) { @@ -294,7 +313,7 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env } } - if (IsFDExec) { + if (IsFDExec && !IsBinfmtCompatible) { int Flags = fcntl(Args.dirfd, 
F_GETFD); if (Flags & FD_CLOEXEC) { // FEX needs the FD to live past execve when binfmt_misc isn't used, @@ -316,6 +335,15 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env EnvpArgs.emplace_back(FDExecEnv.data()); } + if (HasSeccomp) { + // Create the environment variable to pass the FD to our FEX. + // Needs to stick around until execveat completes. + FDSeccompEnv = fextl::fmt::format("FEX_SECCOMPFD={}", *SeccompFD); + + // Insert the FD for FEX to track. + EnvpArgs.emplace_back(FDSeccompEnv.data()); + } + // Emplace nullptr at the end to stop EnvpArgs.emplace_back(nullptr); @@ -325,6 +353,8 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env if (IsBinfmtCompatible || IsOtherELF) { Result = ::syscall(SYS_execveat, Args.dirfd, Filename.c_str(), argv, EnvpPtr, Args.flags); + CloseSeccompFD(); + CloseFDExecFD(); SYSCALL_ERRNO(); } @@ -364,11 +394,8 @@ uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* env const char* InterpreterPath = SupportsProcFSInterpreter ? "/proc/self/interpreter" : "/proc/self/exe"; Result = ::syscall(SYS_execveat, Args.dirfd, InterpreterPath, const_cast(ExecveArgs.data()), EnvpPtr, Args.flags); - - if (FDExecCopy) { - ///< Had to make a copy, close it now. - close(Args.dirfd); - } + CloseSeccompFD(); + CloseFDExecFD(); SYSCALL_ERRNO(); } @@ -719,6 +746,7 @@ void SyscallHandler::DefaultProgramBreak(uint64_t Base, uint64_t Size) { SyscallHandler::SyscallHandler(FEXCore::Context::Context* _CTX, FEX::HLE::SignalDelegator* _SignalDelegation) : TM {_CTX, _SignalDelegation} + , SeccompEmulator {this, _SignalDelegation} , FM {_CTX} , CTX {_CTX} , SignalDelegation {_SignalDelegation} { @@ -759,6 +787,15 @@ uint32_t SyscallHandler::CalculateGuestKernelVersion() { } uint64_t SyscallHandler::HandleSyscall(FEXCore::Core::CpuStateFrame* Frame, FEXCore::HLE::SyscallArguments* Args) { + // Grab the return address which will be inside the JIT. 
+ const uint64_t JITPC = reinterpret_cast(__builtin_extract_return_addr(__builtin_return_address(0))); + + const auto SeccompResult = SeccompEmulator.ExecuteFilter(Frame, JITPC, Args); + + if (SeccompResult.EarlyReturn) { + return SeccompResult.Result; + } + if (Args->Argument[0] >= Definitions.size()) { return -ENOSYS; } diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.h index d83cc3b240..d864f37cd1 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.h @@ -11,6 +11,7 @@ desc: Glue logic, STRACE magic #include "LinuxSyscalls/FileManagement.h" #include "LinuxSyscalls/LinuxAllocator.h" #include "LinuxSyscalls/ThreadManager.h" +#include "LinuxSyscalls/Seccomp/SeccompEmulator.h" #include #include @@ -92,17 +93,25 @@ struct ExecveAtArgs { } }; -uint64_t ExecveHandler(const char* pathname, char* const* argv, char* const* envp, ExecveAtArgs Args); +uint64_t ExecveHandler(FEXCore::Core::CpuStateFrame* Frame, const char* pathname, char* const* argv, char* const* envp, ExecveAtArgs Args); class SyscallHandler : public FEXCore::HLE::SyscallHandler, FEXCore::HLE::SourcecodeResolver, public FEXCore::Allocator::FEXAllocOperators { public: ThreadManager TM; + FEX::HLE::SeccompEmulator SeccompEmulator; + virtual ~SyscallHandler(); // In the case that the syscall doesn't hit the optimized path then we still need to go here uint64_t HandleSyscall(FEXCore::Core::CpuStateFrame* Frame, FEXCore::HLE::SyscallArguments* Args) final override; void DefaultProgramBreak(uint64_t Base, uint64_t Size); + void DeserializeSeccompFD(FEX::HLE::ThreadStateObject* Thread, int FD) { + if (FD == -1) { + return; + } + SeccompEmulator.DeserializeFilters(Thread->Thread->CurrentFrame, FD); + } using SyscallPtrArg0 = uint64_t (*)(FEXCore::Core::CpuStateFrame* Frame); using SyscallPtrArg1 = uint64_t (*)(FEXCore::Core::CpuStateFrame* Frame, uint64_t); @@ -136,11 +145,19 
@@ class SyscallHandler : public FEXCore::HLE::SyscallHandler, FEXCore::HLE::Source } FEXCore::HLE::SyscallABI GetSyscallABI(uint64_t Syscall) override { + if (NeedsSeccomp) { + // Override ABI if seccomp is enabled. + return {FEXCore::HLE::SyscallArguments::MAX_ARGS, true, -1}; + } auto& Def = Definitions.at(Syscall); return {Def.NumArgs, true, Def.HostSyscallNumber}; } FEXCore::IR::SyscallFlags GetSyscallFlags(uint64_t Syscall) const override { + if (NeedsSeccomp) { + // Override flags if seccomp is enabled. + return FEXCore::IR::SyscallFlags::DEFAULT; + } auto& Def = Definitions.at(Syscall); return Def.Flags; } @@ -178,6 +195,7 @@ class SyscallHandler : public FEXCore::HLE::SyscallHandler, FEXCore::HLE::Source FEX_CONFIG_OPT(RootFSPath, ROOTFS); FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE); FEX_CONFIG_OPT(SMCChecks, SMCCHECKS); + FEX_CONFIG_OPT(NeedsSeccomp, NEEDSSECCOMP); uint32_t GetHostKernelVersion() const { return HostKernelVersion; diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Info.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Info.cpp index d65bcefff2..a861675ddc 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Info.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Info.cpp @@ -62,8 +62,7 @@ void RegisterInfo(FEX::HLE::SyscallHandler* Handler) { REGISTER_SYSCALL_IMPL_FLAGS(seccomp, SyscallFlags::OPTIMIZETHROUGH | SyscallFlags::NOSYNCSTATEONENTRY, [](FEXCore::Core::CpuStateFrame* Frame, unsigned int operation, unsigned int flags, void* args) -> uint64_t { - // FEX doesn't support seccomp - return -EINVAL; + return FEX::HLE::_SyscallHandler->SeccompEmulator.Handle(Frame, operation, flags, args); }); } } // namespace FEX::HLE diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp index 0cdfc6a9cb..59b40dd568 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp +++ 
b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp @@ -25,6 +25,7 @@ tags: LinuxSyscalls|syscalls-shared #include #include #include +#include #include #include #include @@ -78,7 +79,8 @@ FEX::HLE::ThreadStateObject* CreateNewThread(FEXCore::Context::Context* CTX, FEX NewThreadState.gregs[FEXCore::X86State::REG_RSP] = args->args.stack; } - auto NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &NewThreadState, args->args.parent_tid); + auto NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &NewThreadState, args->args.parent_tid, + FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame)); if (FEX::HLE::_SyscallHandler->Is64BitMode()) { if (flags & CLONE_SETTLS) { @@ -171,7 +173,8 @@ uint64_t HandleNewClone(FEX::HLE::ThreadStateObject* Thread, FEXCore::Context::C } // Overwrite thread - NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &NewThreadState, GuestArgs->parent_tid); + NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &NewThreadState, GuestArgs->parent_tid, + FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame)); // CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID, CLONE_PIDFD will be handled by kernel // Call execution thread directly since we already are on the new thread @@ -412,11 +415,14 @@ void RegisterThread(FEX::HLE::SyscallHandler* Handler) { #define PR_GET_AUXV 0x41555856 #endif switch (option) { - case PR_SET_SECCOMP: - case PR_GET_SECCOMP: - // FEX doesn't support seccomp - return -EINVAL; - break; + case PR_SET_SECCOMP: { + uint32_t Operation {}; + if (arg2 == SECCOMP_MODE_STRICT) Operation = SECCOMP_SET_MODE_STRICT; + if (arg2 == SECCOMP_MODE_FILTER) Operation = SECCOMP_SET_MODE_FILTER; + + return FEX::HLE::_SyscallHandler->SeccompEmulator.Handle(Frame, Operation, 0, reinterpret_cast(arg3)); + } + case PR_GET_SECCOMP: return FEX::HLE::_SyscallHandler->SeccompEmulator.GetSeccomp(Frame); case PR_GET_AUXV: { if (arg4 || arg5) { return -EINVAL; diff --git 
a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp index a8aa01fc25..8c905ed0e3 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp @@ -6,8 +6,8 @@ #include namespace FEX::HLE { -FEX::HLE::ThreadStateObject* -ThreadManager::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID) { +FEX::HLE::ThreadStateObject* ThreadManager::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState, + uint64_t ParentTID, FEX::HLE::ThreadStateObject* InheritThread) { auto ThreadStateObject = new FEX::HLE::ThreadStateObject; ThreadStateObject->ThreadInfo.parent_tid = ParentTID; @@ -20,6 +20,10 @@ ThreadManager::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore: ThreadStateObject->Thread = CTX->CreateThread(InitialRIP, StackPointer, NewThreadState, ParentTID); ThreadStateObject->Thread->FrontendPtr = ThreadStateObject; + if (InheritThread) { + FEX::HLE::_SyscallHandler->SeccompEmulator.InheritSeccompFilters(InheritThread, ThreadStateObject); + } + ++IdleWaitRefCount; return ThreadStateObject; } @@ -58,6 +62,8 @@ void ThreadManager::HandleThreadDeletion(FEX::HLE::ThreadStateObject* Thread, bo } CTX->DestroyThread(Thread->Thread, NeedsTLSUninstall); + FEX::HLE::_SyscallHandler->SeccompEmulator.FreeSeccompFilters(Thread); + delete Thread; --IdleWaitRefCount; IdleWaitCV.notify_all(); diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h index 68e5e6912e..ebc02c80f2 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h @@ -9,12 +9,14 @@ desc: Frontend thread management #pragma once #include "LinuxSyscalls/Types.h" +#include 
"LinuxSyscalls/Seccomp/SeccompEmulator.h" #include #include #include #include +#include namespace FEX::HLE { class SyscallHandler; @@ -49,10 +51,15 @@ struct ThreadStateObject : public FEXCore::Allocator::FEXAllocOperators { uint64_t PendingSignals {}; } SignalInfo {}; + + // Seccomp thread specific data. + uint32_t SeccompMode {SECCOMP_MODE_DISABLED}; + fextl::vector Filters {}; }; class ThreadManager final { public: + ThreadManager(FEXCore::Context::Context* CTX, FEX::HLE::SignalDelegator* SignalDelegation) : CTX {CTX} , SignalDelegation {SignalDelegation} {} @@ -68,8 +75,8 @@ class ThreadManager final { return static_cast(Thread->FrontendPtr); } - FEX::HLE::ThreadStateObject* - CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState = nullptr, uint64_t ParentTID = 0); + FEX::HLE::ThreadStateObject* CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState = nullptr, + uint64_t ParentTID = 0, FEX::HLE::ThreadStateObject* InheritThread = nullptr); void TrackThread(FEX::HLE::ThreadStateObject* Thread) { std::lock_guard lk(ThreadCreationMutex); Threads.emplace_back(Thread); diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/x32/Thread.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/x32/Thread.cpp index c4bd9cd7d0..332bb1a346 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/x32/Thread.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/x32/Thread.cpp @@ -277,7 +277,7 @@ void RegisterThread(FEX::HLE::SyscallHandler* Handler) { FEX::HLE::ExecveAtArgs AtArgs = FEX::HLE::ExecveAtArgs::Empty(); - return FEX::HLE::ExecveHandler(pathname, ArgsPtr, EnvpPtr, AtArgs); + return FEX::HLE::ExecveHandler(Frame, pathname, ArgsPtr, EnvpPtr, AtArgs); }); REGISTER_SYSCALL_IMPL_X32( @@ -307,7 +307,7 @@ void RegisterThread(FEX::HLE::SyscallHandler* Handler) { auto* const* ArgsPtr = argv ? const_cast(Args.data()) : nullptr; auto* const* EnvpPtr = envp ? 
const_cast(Envp.data()) : nullptr; - return FEX::HLE::ExecveHandler(pathname, ArgsPtr, EnvpPtr, AtArgs); + return FEX::HLE::ExecveHandler(Frame, pathname, ArgsPtr, EnvpPtr, AtArgs); })); REGISTER_SYSCALL_IMPL_X32(wait4, [](FEXCore::Core::CpuStateFrame* Frame, pid_t pid, int* wstatus, int options, struct rusage_32* rusage) -> uint64_t { diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/x64/Thread.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/x64/Thread.cpp index 017bb4e93e..3d57912708 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/x64/Thread.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/x64/Thread.cpp @@ -103,7 +103,7 @@ void RegisterThread(FEX::HLE::SyscallHandler* Handler) { FEX::HLE::ExecveAtArgs AtArgs = FEX::HLE::ExecveAtArgs::Empty(); - return FEX::HLE::ExecveHandler(pathname, ArgsPtr, EnvpPtr, AtArgs); + return FEX::HLE::ExecveHandler(Frame, pathname, ArgsPtr, EnvpPtr, AtArgs); }); REGISTER_SYSCALL_IMPL_X64_FLAGS( @@ -135,7 +135,7 @@ void RegisterThread(FEX::HLE::SyscallHandler* Handler) { auto* const* ArgsPtr = argv ? const_cast(Args.data()) : nullptr; auto* const* EnvpPtr = envp ? const_cast(Envp.data()) : nullptr; - return FEX::HLE::ExecveHandler(pathname, ArgsPtr, EnvpPtr, AtArgs); + return FEX::HLE::ExecveHandler(Frame, pathname, ArgsPtr, EnvpPtr, AtArgs); })); } } // namespace FEX::HLE::x64