Index: libcfa/src/concurrency/CtxSwitch-i386.S
===================================================================
--- libcfa/src/concurrency/CtxSwitch-i386.S	(revision 3c06bba7aa43dce1e45992856c5195d83a055cdf)
+++ libcfa/src/concurrency/CtxSwitch-i386.S	(revision deca0f54cfb65fe7ba96231b54e8f4c6be2201cc)
@@ -52,10 +52,4 @@
 	movl 4(%esp),%eax
 
-	// Save floating & SSE control words on the stack.
-
-	sub    $8,%esp
-	stmxcsr 0(%esp)         // 4 bytes
-	fnstcw  4(%esp)         // 2 bytes
-
 	// Save volatile registers on the stack.
 
@@ -86,10 +80,4 @@
 	popl %ebx
 
-	// Load floating & SSE control words from the stack.
-
-	fldcw   4(%esp)
-	ldmxcsr 0(%esp)
-	add    $8,%esp
-
 	// Return to thread.
 
Index: libcfa/src/concurrency/CtxSwitch-x86_64.S
===================================================================
--- libcfa/src/concurrency/CtxSwitch-x86_64.S	(revision 3c06bba7aa43dce1e45992856c5195d83a055cdf)
+++ libcfa/src/concurrency/CtxSwitch-x86_64.S	(revision deca0f54cfb65fe7ba96231b54e8f4c6be2201cc)
@@ -46,10 +46,4 @@
 CtxSwitch:
 
-	// Save floating & SSE control words on the stack.
-
-	subq   $8,%rsp
-	stmxcsr 0(%rsp)         // 4 bytes
-	fnstcw  4(%rsp)         // 2 bytes
-
 	// Save volatile registers on the stack.
 
@@ -78,70 +72,8 @@
 	popq %r15
 
-	// Load floating & SSE control words from the stack.
-
-	fldcw   4(%rsp)
-	ldmxcsr 0(%rsp)
-	addq   $8,%rsp
-
 	// Return to thread.
 
 	ret
 	.size  CtxSwitch, .-CtxSwitch
-
-
-//.text
-//	.align 2
-//.globl	CtxStore
-//CtxStore:
-//	// Save floating & SSE control words on the stack.
-//
-//	subq   $8,%rsp
-//	stmxcsr 0(%rsp)         // 4 bytes
-//	fnstcw  4(%rsp)         // 2 bytes
-//
-//	// Save volatile registers on the stack.
-//
-//	pushq %r15
-//	pushq %r14
-//	pushq %r13
-//	pushq %r12
-//	pushq %rbx
-//
-//	// Save old context in the "from" area.
-//
-//	movq %rsp,SP_OFFSET(%rdi)
-//	movq %rbp,FP_OFFSET(%rdi)
-//
-//	// Return to thread
-//
-//	ret
-//
-//.text
-//	.align 2
-//.globl 	CtxRet
-//CtxRet:
-//	// Load new context from the "to" area.
-//
-//	movq SP_OFFSET(%rdi),%rsp
-//	movq FP_OFFSET(%rdi),%rbp
-//
-//	// Load volatile registers from the stack.
-//
-//	popq %rbx
-//	popq %r12
-//	popq %r13
-//	popq %r14
-//	popq %r15
-//
-//	// Load floating & SSE control words from the stack.
-//
-//	fldcw   4(%rsp)
-//	ldmxcsr 0(%rsp)
-//	addq   $8,%rsp
-//
-//	// Return to thread.
-//
-//	ret
-
 
 .text
Index: libcfa/src/concurrency/invoke.c
===================================================================
--- libcfa/src/concurrency/invoke.c	(revision 3c06bba7aa43dce1e45992856c5195d83a055cdf)
+++ libcfa/src/concurrency/invoke.c	(revision deca0f54cfb65fe7ba96231b54e8f4c6be2201cc)
@@ -124,6 +124,4 @@
 	struct FakeStack {
 	    void *fixedRegisters[3];		  	// fixed registers ebx, edi, esi (popped on 1st uSwitch, values unimportant)
-	    uint32_t mxcr;                        // SSE Status and Control bits (control bits are preserved across function calls)
-	    uint16_t fcw;                         // X97 FPU control word (preserved across function calls)
 	    void *rturn;                          // where to go on return from uSwitch
 	    void *dummyReturn;				// fake return compiler would have pushed on call to uInvoke
@@ -140,6 +138,4 @@
 	fs->argument[0] = this;     // argument to invoke
 	fs->rturn = invoke;
-	fs->mxcr = 0x1F80; //Vol. 2A 3-520
-	fs->fcw = 0x037F;  //Vol. 1 8-7
 
 #elif defined( __x86_64 )
@@ -147,6 +143,4 @@
 	struct FakeStack {
 		void *fixedRegisters[5];            // fixed registers rbx, r12, r13, r14, r15
-		uint32_t mxcr;                      // SSE Status and Control bits (control bits are preserved across function calls)
-		uint16_t fcw;                       // X97 FPU control word (preserved across function calls)
 		void *rturn;                        // where to go on return from uSwitch
 		void *dummyReturn;                  // NULL return address to provide proper alignment
@@ -162,6 +156,4 @@
 	fs->fixedRegisters[0] = this;
 	fs->fixedRegisters[1] = invoke;
-	fs->mxcr = 0x1F80; //Vol. 2A 3-520
-	fs->fcw = 0x037F;  //Vol. 1 8-7
 
 #elif defined( __ARM_ARCH )
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 3c06bba7aa43dce1e45992856c5195d83a055cdf)
+++ libcfa/src/concurrency/invoke.h	(revision deca0f54cfb65fe7ba96231b54e8f4c6be2201cc)
@@ -264,23 +264,4 @@
 	// void CtxRet   ( void * dst  ) asm ("CtxRet");
 
-	#if   defined( __i386 )
-	#define CtxGet( ctx ) __asm__ ( \
-			"movl %%esp,%0\n"   \
-			"movl %%ebp,%1\n"   \
-		: "=rm" (ctx.SP), "=rm" (ctx.FP) )
-	#elif defined( __x86_64 )
-	#define CtxGet( ctx ) __asm__ ( \
-			"movq %%rsp,%0\n"   \
-			"movq %%rbp,%1\n"   \
-		: "=rm" (ctx.SP), "=rm" (ctx.FP) )
-	#elif defined( __ARM_ARCH )
-	#define CtxGet( ctx ) __asm__ ( \
-			"mov %0,%%sp\n"   \
-			"mov %1,%%r11\n"   \
-		: "=rm" (ctx.SP), "=rm" (ctx.FP) )
-	#else
-		#error unknown hardware architecture
-	#endif
-
 #endif //_INVOKE_PRIVATE_H_
 #endif //! defined(__CFA_INVOKE_PRIVATE__)
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 3c06bba7aa43dce1e45992856c5195d83a055cdf)
+++ libcfa/src/concurrency/kernel.cfa	(revision deca0f54cfb65fe7ba96231b54e8f4c6be2201cc)
@@ -36,4 +36,73 @@
 #include "invoke.h"
 
+//-----------------------------------------------------------------------------
+// Arch-specific inline assembly: CtxGet (capture current SP/FP) and __x87_store/__x87_load (save/restore x87 FCW and SSE MXCSR control words)
+#if   defined( __i386 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movl %%esp,%0\n"\
+			"movl %%ebp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+
+	// mxcr : SSE Status and Control bits (control bits are preserved across function calls)
+	// fcw  : X87 FPU control word (preserved across function calls)
+	#define __x87_store         \
+		uint32_t __mxcr;      \
+		uint16_t __fcw;       \
+		__asm__ volatile (    \
+			"stmxcsr %0\n"  \
+			"fnstcw  %1\n"  \
+			: "=m" (__mxcr),\
+				"=m" (__fcw)  \
+		)
+
+	#define __x87_load         \
+		__asm__ volatile (   \
+			"fldcw  %1\n"  \
+			"ldmxcsr %0\n" \
+			::"m" (__mxcr),\
+				"m" (__fcw)  \
+		)
+
+#elif defined( __x86_64 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movq %%rsp,%0\n"\
+			"movq %%rbp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+
+	#define __x87_store         \
+		uint32_t __mxcr;      \
+		uint16_t __fcw;       \
+		__asm__ volatile (    \
+			"stmxcsr %0\n"  \
+			"fnstcw  %1\n"  \
+			: "=m" (__mxcr),\
+				"=m" (__fcw)  \
+		)
+
+	#define __x87_load          \
+		__asm__ volatile (    \
+			"fldcw  %1\n"   \
+			"ldmxcsr %0\n"  \
+			:: "m" (__mxcr),\
+				"m" (__fcw)  \
+		)
+
+
+#elif defined( __ARM_ARCH )
+#define CtxGet( ctx ) __asm__ ( \
+		"mov %0,%%sp\n"   \
+		"mov %1,%%r11\n"   \
+	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
+#else
+	#error unknown hardware architecture
+#endif
+
+//-----------------------------------------------------------------------------
 //Start and stop routine for the kernel, declared first to make sure they run first
 static void kernel_startup(void)  __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
@@ -274,4 +343,7 @@
 	proc_cor->state = Active;
 	int local_errno = *__volatile_errno();
+	#if defined( __i386 ) || defined( __x86_64 )
+		__x87_store;
+	#endif
 
 	// set new coroutine that the processor is executing
@@ -283,4 +355,8 @@
 	proc_cor->state = proc_cor->state == Halted ? Halted : Inactive;
 	thrd_src->state = Active;
+
+	#if defined( __i386 ) || defined( __x86_64 )
+		__x87_load;
+	#endif
 	*__volatile_errno() = local_errno;
 }
