Index: src/libcfa/bits/locks.h
===================================================================
--- src/libcfa/bits/locks.h	(revision a1a17a744695781ffbd73827d771f9063f9c11ba)
+++ src/libcfa/bits/locks.h	(revision 13073be42dba9d23e1eb3322d700f265381ae024)
@@ -39,18 +39,14 @@
 #endif
 
-#if __SIZEOF_SIZE_T__ == 8
-	#define __lock_test_and_test_and_set( lock ) (lock) == 0 && __sync_lock_test_and_set_8( &(lock), 1 ) == 0
-	#define __lock_release( lock ) __sync_lock_release_8( &(lock) );
-#elif __SIZEOF_SIZE_T__ == 4
-	#define __lock_test_and_test_and_set( lock ) (lock) == 0 && __sync_lock_test_and_set_4( &(lock), 1 ) == 0
-	#define __lock_release( lock ) __sync_lock_release_4( &(lock) );
-#else
-	#error unsupported architecture
-#endif
-
 struct __spinlock_t {
-	__ALIGN__ volatile size_t lock;
+	// Wrap in struct to prevent false sharing with debug info
+	struct {
+		// Align lock on 128-bit boundary
+		__ALIGN__ volatile _Bool lock;
+	};
 	#ifdef __CFA_DEBUG__
+		// previous function to acquire the lock
 		const char * prev_name;
+		// previous thread to acquire the lock
 		void* prev_thrd;
 	#endif
@@ -78,5 +74,5 @@
 	// Lock the spinlock, return false if already acquired
 	static inline _Bool try_lock  ( __spinlock_t & this __cfaabi_dbg_ctx_param2 ) {
-		_Bool result = __lock_test_and_test_and_set( this.lock );
+		_Bool result = (this.lock == 0) && (__atomic_test_and_set( &this.lock, __ATOMIC_ACQUIRE ) == 0);
 		if( result ) {
 			disable_interrupts();
@@ -94,5 +90,5 @@
 
 		for ( unsigned int i = 1;; i += 1 ) {
-			if ( __lock_test_and_test_and_set( this.lock ) ) break;
+			if ( (this.lock == 0) && (__atomic_test_and_set( &this.lock, __ATOMIC_ACQUIRE ) == 0) ) break;
 			#ifndef NOEXPBACK
 				// exponential spin
@@ -112,20 +108,7 @@
 	}
 
-	// // Lock the spinlock, yield if already acquired
-	// static inline void lock_yield( __spinlock_t & this __cfaabi_dbg_ctx_param2 ) {
-	// 	for ( unsigned int i = 1;; i += 1 ) {
-	// 		if ( __lock_test_and_test_and_set( this.lock ) ) break;
-	// 		yield( i );
-	// 	}
-	// 	disable_interrupts();
-	// 	__cfaabi_dbg_debug_do(
-	// 		this.prev_name = caller;
-	// 		this.prev_thrd = this_thread;
-	// 	)
-	// }
-
 	static inline void unlock( __spinlock_t & this ) {
 		enable_interrupts_noPoll();
-		__lock_release( this.lock );
+		__atomic_clear( &this.lock, __ATOMIC_RELEASE );
 	}
 #endif
Index: src/libcfa/concurrency/preemption.c
===================================================================
--- src/libcfa/concurrency/preemption.c	(revision a1a17a744695781ffbd73827d771f9063f9c11ba)
+++ src/libcfa/concurrency/preemption.c	(revision 13073be42dba9d23e1eb3322d700f265381ae024)
@@ -161,5 +161,14 @@
 	void disable_interrupts() {
 		with( kernelTLS.preemption_state ) {
-			enabled = false;
+			static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
+
+			// Set enabled flag to false
+			// should be atomic to avoid preemption in the middle of the operation.
+			// use memory order RELAXED since there are no inter-thread requirements on this variable
+			__atomic_store_n(&enabled, false, __ATOMIC_RELAXED);
+
+			// Signal the compiler that a fence is needed but only for signal handlers
+			__atomic_signal_fence(__ATOMIC_ACQUIRE);
+
 			__attribute__((unused)) unsigned short new_val = disable_count + 1;
 			disable_count = new_val;
@@ -171,6 +180,6 @@
 	// If counter reaches 0, execute any pending CtxSwitch
 	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
-		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic add
-		thread_desc * thrd = kernelTLS.this_thread;	  // Cache the thread now since interrupts can start happening after the atomic add
+		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+		thread_desc * thrd = kernelTLS.this_thread;	  // Cache the thread now since interrupts can start happening after the atomic store
 
 		with( kernelTLS.preemption_state ){
@@ -181,5 +190,13 @@
 			// Check if we need to prempt the thread because an interrupt was missed
 			if( prev == 1 ) {
-				enabled = true;
+				static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
+
+				// Set enabled flag to true
+				// should be atomic to avoid preemption in the middle of the operation.
+				// use memory order RELAXED since there are no inter-thread requirements on this variable
+				__atomic_store_n(&enabled, true, __ATOMIC_RELAXED);
+
+				// Signal the compiler that a fence is needed but only for signal handlers
+				__atomic_signal_fence(__ATOMIC_RELEASE);
 				if( proc->pending_preemption ) {
 					proc->pending_preemption = false;
@@ -200,5 +217,12 @@
 		verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
 		if( prev == 1 ) {
-			kernelTLS.preemption_state.enabled = true;
+			static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
+			// Set enabled flag to true
+			// should be atomic to avoid preemption in the middle of the operation.
+			// use memory order RELAXED since there are no inter-thread requirements on this variable
+			__atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
+
+			// Signal the compiler that a fence is needed but only for signal handlers
+			__atomic_signal_fence(__ATOMIC_RELEASE);
 		}
 	}
