making thread_local fiber_local #618

Open · wants to merge 5 commits into base: main · changes from all commits
53 changes: 53 additions & 0 deletions glibc-2.40.patch
@@ -0,0 +1,53 @@
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -210,7 +210,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
new stack or reusing a cached stack of sufficient size.
ATTR must be non-NULL and point to a valid pthread_attr.
PDP must be non-NULL. */
-static int
+int
Review comment by @lihuiba (Collaborator, Author), Nov 6, 2024:

Making the internal function allocate_stack() in libpthread public, so that it is accessible by, say, fiber_create().
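To make the intent concrete, here is a minimal sketch (not part of the patch) of how a caller outside glibc could declare and use the symbol once it is no longer static; it mirrors the extern declaration and the call that thread.cpp adds later in this PR. The helper name make_fiber_stack is invented for the example.

```cpp
#include <pthread.h>
#include <cstddef>

struct pthread;       // glibc-internal thread descriptor, kept opaque here
struct pthread_attr;  // glibc-internal attribute layout

extern "C" int allocate_stack(const struct pthread_attr* attr,
                              struct pthread** pdp,
                              void** stack, size_t* stacksize);

// Hypothetical helper: obtain a glibc-managed stack + TCB + TLS for a fiber.
static int make_fiber_stack(struct pthread** pd, void** stack, size_t* size) {
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    // The public pthread_attr_t is cast to the internal layout, just as
    // thread_create() does further down in this PR.
    int rc = allocate_stack(reinterpret_cast<const struct pthread_attr*>(&attr),
                            pd, stack, size);
    pthread_attr_destroy(&attr);
    return rc;
}
```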

allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
void **stack, size_t *stacksize)
{
@@ -403,6 +403,18 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
}
}

+ /* Initialize the TCB. All initializations with zero should be
+ performed in 'get_cached_stack'. This way we avoid doing this if
+ the stack freshly allocated with 'mmap'. */
+
+#if TLS_TCB_AT_TP
+ /* Reference to the TCB itself. */
+ pd->header.self = pd;
+
Review comment (Collaborator, Author):

These lines are moved here from pthread_create(), because fiber_create() doesn't have the definition of *pd and therefore cannot perform this initialization outside.
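For context, the sketch below is a simplified view (not glibc's actual definitions) of what these two stores establish under TLS_TCB_AT_TP: the word at %fs:0 must point back at the TCB, and header.self must point at the owning struct pthread, before anything installs this block as the thread pointer.

```cpp
// Simplified view of glibc's tcbhead_t on x86_64 (the real struct has many
// more members); shown only to illustrate the two self-references above.
struct tcbhead_like {
    void* tcb;   // %fs:0 must point back at the TCB itself
    void* dtv;   // dynamic thread vector for per-module TLS
    void* self;  // pointer to the owning struct pthread
};

struct pthread_like {
    tcbhead_like header;
    // ... the rest of glibc's struct pthread ...
};

// What allocate_stack() now does for every descriptor it hands out, so that
// a fiber can later install this block as its thread pointer without any
// further setup.
inline void init_tcb_self_refs(pthread_like* pd) {
    pd->header.self = pd;
    pd->header.tcb  = pd;
}
```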

+ /* Self-reference for TLS. */
+ pd->header.tcb = pd;
+#endif
+
/* Remember the stack-related values. */
pd->stackblock = mem;
pd->stackblock_size = size;
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 1d3665d5..6ee79035 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -665,18 +665,6 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
}


- /* Initialize the TCB. All initializations with zero should be
- performed in 'get_cached_stack'. This way we avoid doing this if
- the stack freshly allocated with 'mmap'. */
-
-#if TLS_TCB_AT_TP
- /* Reference to the TCB itself. */
- pd->header.self = pd;
-
- /* Self-reference for TLS. */
- pd->header.tcb = pd;
-#endif
-
/* Store the address of the start routine and the parameter. Since
we do not start the function directly the stillborn thread will
get the information from its thread descriptor. */
64 changes: 30 additions & 34 deletions thread/thread.cpp
@@ -31,6 +31,7 @@ limitations under the License.
#include <thread>
#include <mutex>
#include <condition_variable>
#include <pthread.h>

#ifdef _WIN64
#include <processthreadsapi.h>
@@ -53,6 +54,10 @@ inline int posix_memalign(void** memptr, size_t alignment, size_t size) {
#include <photon/thread/thread-key.h>
#include <photon/thread/arch.h>

struct pthread_attr;
extern "C" int allocate_stack(const struct pthread_attr *attr,
struct pthread **pdp, void **stack, size_t *stacksize);

/* notes on the scheduler:

1. runq (denoted by CURRENT) and sleepq are completely private,
@@ -173,6 +178,7 @@ namespace photon
struct thread : public intrusive_list_node<thread> {
volatile vcpu_t* vcpu;
Stack stack;
pthread_t tcb_or_tp;
// offset 32B
int idx = -1; /* index in the sleep queue array */
int error_number = 0;
@@ -258,7 +264,8 @@ namespace photon
stack_size = stack_high - stack_low;
#elif defined(__linux__)
pthread_attr_t gattr;
pthread_getattr_np(pthread_self(), &gattr);
tcb_or_tp = pthread_self();
pthread_getattr_np(tcb_or_tp, &gattr);
pthread_attr_getstack(&gattr,
(void**)&stackful_alloc_top, &stack_size);
pthread_attr_destroy(&gattr);
@@ -287,6 +294,7 @@ namespace photon
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
static_assert(offsetof(thread, vcpu) == offsetof(partial_thread, vcpu), "...");
static_assert(offsetof(thread, tls) == offsetof(partial_thread, tls), "...");
static_assert(offsetof(thread, tcb_or_tp) - offsetof(thread, stack) == 8, "...");
#pragma GCC diagnostic pop

struct thread_list : public intrusive_list<thread>
@@ -534,14 +542,7 @@ namespace photon
vcpu = current->get_vcpu();
(plock = &vcpu->runq_lock) -> foreground_lock();
}
mutable bool update_current = false;
void set_current(thread* th) const {
Review comment (Collaborator, Author):

There's no longer any need to set/switch the thread-local variable CURRENT as part of context switching, given that it has become a fiber-local variable.
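A rough sketch of the mechanism this depends on (illustrative only, not the PR's code): every fiber now owns a TCB/TLS block, and the switch path rewrites the x86_64 thread pointer (the %fs base, via the wrfsbase instructions added further down in this diff), so the same thread_local slot resolves to the new fiber's copy without any explicit assignment.

```cpp
#include <cstdint>

struct thread;                           // fiber descriptor, as in this file
thread_local thread* CURRENT = nullptr;  // now effectively one slot per fiber

// Assumes the kernel has enabled user-space FSGSBASE; a fallback would use
// arch_prctl(ARCH_SET_FS, tcb) instead.
static inline void set_thread_pointer(void* tcb) {
    asm volatile("wrfsbase %0" : : "r"(tcb));
}

// Hypothetical switch tail: once %fs points at the target fiber's TCB, every
// thread_local access (including CURRENT) goes through that fiber's TLS
// block, so no explicit "CURRENT = to" assignment is needed anymore.
inline void bind_fiber_tls(void* target_tcb) {
    set_thread_pointer(target_tcb);
}
```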

current = th;
update_current = true;
}
~AtomicRunQ() {
if (update_current)
*pc = current;
plock->foreground_unlock();
}
static void prefetch_context(thread* from, thread* to)
@@ -560,7 +561,6 @@
assert(!current->single());
auto from = current;
auto to = from->remove_from_list();
set_current(to);
prefetch_context(from, to);
from->state = new_state;
to->state = states::RUNNING;
@@ -571,7 +571,6 @@
prefetch_context(from, to);
from->state = states::READY;
to->state = states::RUNNING;
set_current(to);
return {from, to};
}
Switch goto_next() const {
@@ -618,12 +617,6 @@
}
};

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
static_assert(offsetof(thread, arg) == 0x40, "...");
static_assert(offsetof(thread, start) == 0x48, "...");
#pragma GCC diagnostic pop

inline void thread::dequeue_ready_atomic(states newstat)
{
assert("this is not in runq, and this->lock is locked");
@@ -648,8 +641,6 @@
to->get_vcpu()->switch_count++;
}

static void _photon_thread_die(thread* th) asm("_photon_thread_die");

#if defined(__x86_64__)
#if !defined(_WIN64)
asm(
@@ -659,6 +650,8 @@ R"(
mov %rsp, (%rsi)
mov (%rdi), %rsp
pop %rbp
mov 8(%rdi), %rax
wrfsbase %rax
ret
)"

@@ -672,18 +665,10 @@ DEF_ASM_FUNC(_photon_switch_context_defer_die) // (void* rdi_arg, void (*rsi_def
R"(
mov (%rdx), %rsp
pop %rbp
mov 8(%rdx), %rax
wrfsbase %rax
jmp *%rsi
)"

DEF_ASM_FUNC(_photon_thread_stub)
Review comment (Collaborator, Author):

_photon_thread_stub() becomes more complex and is better implemented in C++.

R"(
mov 0x40(%rbp), %rdi
movq $0, 0x40(%rbp)
call *0x48(%rbp)
mov %rax, 0x48(%rbp)
mov %rbp, %rdi
call _photon_thread_die
)"
);

inline void switch_context(thread* from, thread* to) {
@@ -896,14 +881,17 @@ R"(
_photon_switch_context_defer_die(
arg, func, sw.to->stack.pointer_ref());
}
static __attribute__((used, noreturn))
void _photon_thread_die(thread* th) {
static __attribute__((noreturn))
void _photon_thread_stub() {
register thread* th asm("rbp");
Review comment by @lihuiba (Collaborator, Author), Nov 6, 2024:

On entry, %rbp points to the struct thread.
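The declaration below uses GCC/Clang's explicit-register-variable extension, which binds a local to a named register so the value the context-switch code left in %rbp is directly visible to C++. As a self-contained illustration of the same extension (in its documented role as a pinned asm operand; the wrapper name raw_write is hypothetical):

```cpp
#include <cstddef>

// Issue a write(2) system call with its arguments pinned to the registers
// the Linux x86_64 syscall ABI requires; each local is bound to a specific
// register by name, just as the stub binds its argument to %rbp.
static long raw_write(int fd, const void* buf, size_t len) {
    register long        rax asm("rax") = 1;   // __NR_write on x86_64
    register long        rdi asm("rdi") = fd;
    register const void* rsi asm("rsi") = buf;
    register size_t      rdx asm("rdx") = len;
    asm volatile("syscall"
                 : "+r"(rax)
                 : "r"(rdi), "r"(rsi), "r"(rdx)
                 : "rcx", "r11", "memory");    // syscall clobbers rcx/r11
    return rax;
}
```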

CURRENT = th; // CURRENT is now fiber-local
auto arg = th->arg;
th->tls = 0; // union with th->arg
th->retval = th->start(arg);
assert(th == CURRENT);
th->die();
}

extern "C" void _photon_thread_stub() asm ("_photon_thread_stub");

thread* thread_create(thread_entry start, void* arg,
uint64_t stack_size, uint16_t reserved_space) {
RunQ rq;
@@ -922,7 +910,14 @@
stack_size, least_stack_size, least_stack_size);
stack_size = least_stack_size;
}
char* ptr = (char*)photon_thread_alloc(stack_size);
// char* ptr = (char*)photon_thread_alloc(stack_size);
struct pthread* pd;
char* ptr; // ptr to stack
size_t pstacksize;
pthread_attr_t attr;
pthread_attr_init(&attr);
allocate_stack((struct pthread_attr*)&attr, &pd, (void**)&ptr, &pstacksize);
Review comment by @lihuiba (Collaborator, Author), Nov 6, 2024:

We must make use of allocate_stack() in libpthread to allocate the stack, the TCB (compatible with the threading facilities), and TLS (for all modules, whether loaded at startup or at run time).
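To make the "all modules" part concrete, here is an illustrative example (library and symbol names are hypothetical) of why a glibc-built TCB/DTV matters: a thread_local defined in a dlopen'ed module is reached through the DTV hanging off the thread pointer, which only exists if glibc set up the TLS block.

```cpp
#include <dlfcn.h>
#include <cstdio>

// Accessing TLS owned by a dynamically loaded module goes through the DTV of
// the calling fiber's TCB; a hand-rolled stack with no proper TCB would fault
// here, while one obtained from allocate_stack() works.
void touch_module_tls() {
    void* handle = dlopen("libplugin.so", RTLD_NOW);   // hypothetical module
    if (!handle) return;
    // assumed export of the module: extern "C" int* get_tls_counter();
    auto get_tls_counter =
        reinterpret_cast<int* (*)()>(dlsym(handle, "get_tls_counter"));
    if (get_tls_counter)
        std::printf("per-fiber counter = %d\n", *get_tls_counter());
    dlclose(handle);
}
```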


if (unlikely(!ptr))
return nullptr;
uint64_t p = (uint64_t)ptr + stack_size - sizeof(thread) - randomizer;
@@ -934,6 +929,7 @@
th->stack_size = stack_size;
th->arg = arg;
auto sp = align_down(p - reserved_space, 64);
th->tcb_or_tp = (pthread_t)pd;
th->stack.init((void*)sp, &_photon_thread_stub, th);
AtomicRunQ arq(rq);
th->vcpu = arq.vcpu;
@@ -1405,7 +1401,7 @@ R"(
}
void thread_exit(void* retval) {
CURRENT->retval = retval;
_photon_thread_die(CURRENT);
CURRENT->die();
}

int thread_shutdown(thread* th, bool flag)
2 changes: 1 addition & 1 deletion thread/thread.h
@@ -145,7 +145,7 @@ namespace photon
struct partial_thread {
uint64_t _, __;
volatile vcpu_base* vcpu;
uint64_t ___[5];
uint64_t ___[6];
void* tls;
};
