Index: src/libcfa/concurrency/CtxSwitch-i386.S
===================================================================
--- src/libcfa/concurrency/CtxSwitch-i386.S	(revision 2781e65c359cf1977f2538da573898134135ea2b)
+++ src/libcfa/concurrency/CtxSwitch-i386.S	(revision 68ac32ea18da20c419e6bf09e1ae72348b4ac166)
@@ -52,4 +52,10 @@
 	movl 4(%esp),%eax
 
+	// Save floating & SSE control words on the stack.
+
+        sub    $8,%esp
+        stmxcsr 0(%esp)         // 4 bytes
+        fnstcw  4(%esp)         // 2 bytes
+
 	// Save volatile registers on the stack.
 
@@ -69,5 +75,5 @@
-	// argument is now at 8 + 12 = 20(%esp)
+	// argument is now at 8 + 8 + 12 = 28(%esp)
 
-	movl 20(%esp),%eax
+	movl 28(%esp),%eax
 
 	// Load new context from the "to" area.
@@ -81,4 +87,10 @@
 	popl %edi
 	popl %ebx
+
+	// Load floating & SSE control words from the stack.
+
+        fldcw   4(%esp)
+        ldmxcsr 0(%esp)
+        add    $8,%esp
 
 	// Return to thread.
Index: src/libcfa/concurrency/CtxSwitch-x86_64.S
===================================================================
--- src/libcfa/concurrency/CtxSwitch-x86_64.S	(revision 2781e65c359cf1977f2538da573898134135ea2b)
+++ src/libcfa/concurrency/CtxSwitch-x86_64.S	(revision 68ac32ea18da20c419e6bf09e1ae72348b4ac166)
@@ -47,9 +47,12 @@
 CtxSwitch:
 
-	// Save volatile registers on the stack.
+	// Save floating & SSE control words on the stack.
 
 	subq   $8,%rsp
 	stmxcsr 0(%rsp)         // 4 bytes
 	fnstcw  4(%rsp)         // 2 bytes
+
+	// Save volatile registers on the stack.
+
 	pushq %r15
 	pushq %r14
@@ -75,7 +78,10 @@
 	popq %r14
 	popq %r15
+
+	// Load floating & SSE control words from the stack.
+
 	fldcw   4(%rsp)
 	ldmxcsr 0(%rsp)
-	addq $8,%rsp
+	addq   $8,%rsp
 
 	// Return to thread.
Index: src/libcfa/concurrency/invoke.c
===================================================================
--- src/libcfa/concurrency/invoke.c	(revision 2781e65c359cf1977f2538da573898134135ea2b)
+++ src/libcfa/concurrency/invoke.c	(revision 68ac32ea18da20c419e6bf09e1ae72348b4ac166)
@@ -91,5 +91,7 @@
 	struct FakeStack {
 	    void *fixedRegisters[3];		  	// fixed registers ebx, edi, esi (popped on 1st uSwitch, values unimportant)
-	    void *rturn;				      // where to go on return from uSwitch
+	    uint32_t mxcr;                              // SSE Status and Control bits (control bits are preserved across function calls)
+            uint16_t fcw;                               // x87 FPU control word (preserved across function calls)
+	    void *rturn;                                // where to go on return from uSwitch
 	    void *dummyReturn;				// fake return compiler would have pushed on call to uInvoke
 	    void *argument[3];				// for 16-byte ABI, 16-byte alignment starts here
@@ -108,7 +110,7 @@
       struct FakeStack {
             void *fixedRegisters[5];			// fixed registers rbx, r12, r13, r14, r15
-            uint32_t mxcr;			            // SSE Status and Control bits (control bits are preserved across function calls)
-            uint16_t fcw;			            // X97 FPU control word (preserved across function calls)
-            void *rturn;				      // where to go on return from uSwitch
+            uint32_t mxcr;                              // SSE Status and Control bits (control bits are preserved across function calls)
+            uint16_t fcw;                               // x87 FPU control word (preserved across function calls)
+            void *rturn;                                // where to go on return from uSwitch
             void *dummyReturn;				// NULL return address to provide proper alignment
       };
