diff --git a/glibc-2.40.patch b/glibc-2.40.patch
new file mode 100644
index 00000000..c1bcf1e6
--- /dev/null
+++ b/glibc-2.40.patch
@@ -0,0 +1,53 @@
+--- a/nptl/allocatestack.c
++++ b/nptl/allocatestack.c
+@@ -210,7 +210,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
+    new stack or reusing a cached stack of sufficient size.
+    ATTR must be non-NULL and point to a valid pthread_attr.
+    PDP must be non-NULL.  */
+-static int
++int
+ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
+                void **stack, size_t *stacksize)
+ {
+@@ -403,6 +403,18 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
+                }
+            }
+
++  /* Initialize the TCB.  All initializations with zero should be
++     performed in 'get_cached_stack'.  This way we avoid doing this if
++     the stack is freshly allocated with 'mmap'.  */
++
++#if TLS_TCB_AT_TP
++    /* Reference to the TCB itself.  */
++    pd->header.self = pd;
++
++    /* Self-reference for TLS.  */
++    pd->header.tcb = pd;
++#endif
++
+          /* Remember the stack-related values.  */
+          pd->stackblock = mem;
+          pd->stackblock_size = size;
+diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
+index 1d3665d5..6ee79035 100644
+--- a/nptl/pthread_create.c
++++ b/nptl/pthread_create.c
+@@ -665,18 +665,6 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
+     }
+
+
+-  /* Initialize the TCB.  All initializations with zero should be
+-     performed in 'get_cached_stack'.  This way we avoid doing this if
+-     the stack freshly allocated with 'mmap'.  */
+-
+-#if TLS_TCB_AT_TP
+-  /* Reference to the TCB itself.  */
+-  pd->header.self = pd;
+-
+-  /* Self-reference for TLS.  */
+-  pd->header.tcb = pd;
+-#endif
+-
+   /* Store the address of the start routine and the parameter.  Since
+      we do not start the function directly the stillborn thread will
+      get the information from its thread descriptor.  */
diff --git a/thread/thread.cpp b/thread/thread.cpp
index f3595163..ed721f40 100644
--- a/thread/thread.cpp
+++ b/thread/thread.cpp
@@ -31,6 +31,7 @@ limitations under the License.
 #include <thread>
 #include <mutex>
 #include <condition_variable>
+#include <pthread.h>
 
 #ifdef _WIN64
 #include <processthreadsapi.h>
@@ -53,6 +54,10 @@ inline int posix_memalign(void** memptr, size_t alignment, size_t size) {
 #include <photon/thread/thread-key.h>
 #include <photon/thread/arch.h>
 
+struct pthread_attr;  // struct type named in glibc's allocate_stack() signature
+extern "C" int allocate_stack(const struct pthread_attr *attr,  // nptl/allocatestack.c; made non-static by glibc-2.40.patch
+    struct pthread **pdp, void **stack, size_t *stacksize);
+
 /* notes on the scheduler:
 
 1. runq (denoted by CURRENT) and sleepq are compeltely private,
@@ -173,6 +178,7 @@ namespace photon
     struct thread : public intrusive_list_node<thread> {
         volatile vcpu_t* vcpu;
         Stack stack;
+        pthread_t tcb_or_tp;
-// offset 32B
+// offset 40B
         int idx = -1;                       /* index in the sleep queue array */
         int error_number = 0;
@@ -258,7 +264,8 @@ namespace photon
             stack_size = stack_high - stack_low;
 #elif defined(__linux__)
             pthread_attr_t gattr;
-            pthread_getattr_np(pthread_self(), &gattr);
+            tcb_or_tp = pthread_self();
+            pthread_getattr_np(tcb_or_tp, &gattr);
             pthread_attr_getstack(&gattr,
                 (void**)&stackful_alloc_top, &stack_size);
             pthread_attr_destroy(&gattr);
@@ -287,6 +294,7 @@ namespace photon
 #pragma GCC diagnostic ignored "-Winvalid-offsetof"
     static_assert(offsetof(thread, vcpu) == offsetof(partial_thread, vcpu), "...");
     static_assert(offsetof(thread,  tls) == offsetof(partial_thread,  tls), "...");
+    static_assert(offsetof(thread, tcb_or_tp) - offsetof(thread, stack) == 8, "...");  // switch asm loads tcb_or_tp from 8 bytes past the stack slot before wrfsbase
 #pragma GCC diagnostic pop
 
     struct thread_list : public intrusive_list<thread>
@@ -534,14 +542,7 @@ namespace photon
             vcpu = current->get_vcpu();
             (plock = &vcpu->runq_lock) -> foreground_lock();
         }
-        mutable bool update_current = false;
-        void set_current(thread* th) const {
-            current = th;
-            update_current = true;
-        }
         ~AtomicRunQ() {
-            if (update_current)
-                *pc = current;
             plock->foreground_unlock();
         }
         static void prefetch_context(thread* from, thread* to)
@@ -560,7 +561,6 @@ namespace photon
             assert(!current->single());
             auto from = current;
             auto to = from->remove_from_list();
-            set_current(to);
             prefetch_context(from, to);
             from->state = new_state;
             to->state = states::RUNNING;
@@ -571,7 +571,6 @@ namespace photon
             prefetch_context(from, to);
             from->state = states::READY;
             to->state = states::RUNNING;
-            set_current(to);
             return {from, to};
         }
         Switch goto_next() const {
@@ -618,12 +617,6 @@ namespace photon
         }
     };
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-    static_assert(offsetof(thread, arg)   == 0x40, "...");
-    static_assert(offsetof(thread, start) == 0x48, "...");
-#pragma GCC diagnostic pop
-
     inline void thread::dequeue_ready_atomic(states newstat)
     {
         assert("this is not in runq, and this->lock is locked");
@@ -648,8 +641,6 @@ namespace photon
         to->get_vcpu()->switch_count++;
     }
 
-    static void _photon_thread_die(thread* th) asm("_photon_thread_die");
-
 #if defined(__x86_64__)
 #if !defined(_WIN64)
     asm(
@@ -659,6 +650,8 @@ R"(
         mov     %rsp, (%rsi)
         mov     (%rdi), %rsp
         pop     %rbp
+        mov     8(%rdi), %rax
+        wrfsbase %rax
         ret
 )"
 
@@ -672,18 +665,10 @@ DEF_ASM_FUNC(_photon_switch_context_defer_die) // (void* rdi_arg, void (*rsi_def
 R"(
         mov     (%rdx), %rsp
         pop     %rbp
+        mov     8(%rdx), %rax
+        wrfsbase %rax
         jmp     *%rsi
 )"
-
-DEF_ASM_FUNC(_photon_thread_stub)
-R"(
-        mov     0x40(%rbp), %rdi
-        movq    $0, 0x40(%rbp)
-        call    *0x48(%rbp)
-        mov     %rax, 0x48(%rbp)
-        mov     %rbp, %rdi
-        call    _photon_thread_die
-)"
     );
 
     inline void switch_context(thread* from, thread* to) {
@@ -896,14 +881,17 @@ R"(
         _photon_switch_context_defer_die(
             arg, func, sw.to->stack.pointer_ref());
     }
-    static __attribute__((used, noreturn))
-    void _photon_thread_die(thread* th) {
+    static __attribute__((noreturn))
+    void _photon_thread_stub() {  // entry routine that thread_create() hands to stack.init()
+        register thread* th asm("rbp");  // NOTE(review): presumably rbp holds th on entry; only valid if no frame-pointer prologue is emitted -- confirm
+        CURRENT = th;   // CURRENT is now fiber-local
+        auto arg = th->arg;  // copy arg out before the next line clears the shared storage
+        th->tls = 0;    // union with th->arg
+        th->retval = th->start(arg);  // run the user entry and keep its result in retval
         assert(th == CURRENT);
         th->die();
     }
 
-    extern "C" void _photon_thread_stub() asm ("_photon_thread_stub");
-
     thread* thread_create(thread_entry start, void* arg,
                 uint64_t stack_size, uint16_t reserved_space) {
         RunQ rq;
@@ -922,7 +910,14 @@ R"(
                      stack_size, least_stack_size, least_stack_size);
             stack_size = least_stack_size;
         }
-        char* ptr = (char*)photon_thread_alloc(stack_size);
-        if (unlikely(!ptr))
+        // Stack and TCB both come from glibc's allocate_stack() (exported by glibc-2.40.patch).
+        struct pthread* pd = nullptr;
+        char* ptr = nullptr;  // ptr to stack; stays null when allocation fails
+        size_t pstacksize = 0;  // NOTE(review): never compared with stack_size -- confirm it is large enough
+        pthread_attr_t attr;
+        pthread_attr_init(&attr);
+        int err = allocate_stack((struct pthread_attr*)&attr, &pd, (void**)&ptr, &pstacksize);
+        pthread_attr_destroy(&attr);  // pair init with destroy, as done for gattr elsewhere in this file
+        if (unlikely(err != 0 || !ptr))
             return nullptr;
         uint64_t p = (uint64_t)ptr + stack_size - sizeof(thread) - randomizer;
@@ -934,6 +929,7 @@ R"(
         th->stack_size = stack_size;
         th->arg = arg;
         auto sp = align_down(p - reserved_space, 64);
+        th->tcb_or_tp = (pthread_t)pd;
         th->stack.init((void*)sp, &_photon_thread_stub, th);
         AtomicRunQ arq(rq);
         th->vcpu = arq.vcpu;
@@ -1405,7 +1401,7 @@ R"(
     }
     void thread_exit(void* retval) {
         CURRENT->retval = retval;
-        _photon_thread_die(CURRENT);
+        CURRENT->die();
     }
 
     int thread_shutdown(thread* th, bool flag)
diff --git a/thread/thread.h b/thread/thread.h
index e99999eb..069bdce5 100644
--- a/thread/thread.h
+++ b/thread/thread.h
@@ -145,7 +145,7 @@ namespace photon
     struct partial_thread {
         uint64_t _, __;
         volatile vcpu_base* vcpu;
-        uint64_t ___[5];
+        uint64_t ___[6];
         void* tls;
     };