Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 8fc652e01c609dd8c3f392cd9de90efc31e0d6b2)
+++ libcfa/src/concurrency/preemption.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -10,6 +10,6 @@
 // Created On       : Mon Jun 5 14:20:42 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Aug 26 16:46:03 2020
-// Update Count     : 53
+// Last Modified On : Fri Nov  6 07:42:13 2020
+// Update Count     : 54
 //
 
@@ -163,4 +163,55 @@
 // Kernel Signal Tools
 //=============================================================================================
+// In a user-level threading system, a handful of thread-local variables exhibit the following problem on the ARM.
+//
+// For each kernel thread running user-level threads, there is a flag variable to indicate if interrupts are
+// enabled/disabled for that kernel thread. Therefore, this variable is made thread local.
+//
+// For example, this code fragment sets the state of the "interrupts" variable in thread-local memory.
+//
+// _Thread_local volatile int interrupts;
+// int main() {
+//     interrupts = 0; } // disable interrupts
+//
+// which generates the following code on the ARM
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000000610 <+0>:	mrs	x1, tpidr_el0
+//    0x0000000000000614 <+4>:	mov	w0, #0x0                   	// #0
+//    0x0000000000000618 <+8>:	add	x1, x1, #0x0, lsl #12
+//    0x000000000000061c <+12>:	add	x1, x1, #0x10
+//    0x0000000000000620 <+16>:	str	wzr, [x1]
+//    0x0000000000000624 <+20>:	ret
+//
+// The mrs moves a pointer from coprocessor register tpidr_el0 into register x1.  Register w0 is set to 0. The two adds
+// increase the TLS pointer by the displacement (offset) 0x10, which is the location in the TLS of variable
+// "interrupts".  Finally, 0 is stored into "interrupts" through the pointer in register x1 that points into the
+// TLS. Now once x1 has the pointer to the location in the TLS for kernel thread N, the user thread can be preempted
+// at the user level and put on the user-level ready-queue. When the preempted thread gets to the front of the
+// user-level ready-queue, it is run on kernel thread M. It now stores 0 into "interrupts" back on kernel thread N,
+// turning off interrupts on the wrong kernel thread.
+//
+// On the x86, the following code is generated for the same code fragment.
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000400420 <+0>:	movl   $0x0,%fs:0xfffffffffffffffc
+//    0x000000000040042c <+12>:	xor    %eax,%eax
+//    0x000000000040042e <+14>:	retq
+//
+// Here, base-displacement addressing is used to atomically reset variable "interrupts" relative to the TLS pointer in
+// register "fs".
+//
+// Hence, the ARM has base-displacement addressing for the general-purpose registers, BUT not for the coprocessor
+// registers. As a result, generating the address for the write into variable "interrupts" is no longer atomic.
+//
+// Note this problem does NOT occur when just using multiple kernel threads because the preemption ALWAYS restarts the
+// thread on the same kernel thread.
+//
+// The obvious question is why ARM uses a coprocessor register to store the TLS pointer given that coprocessor
+// registers are second-class registers with respect to the instruction set. One possible answer is that the designers
+// did not want to dedicate one of the general registers to hold the TLS pointer and there was a free coprocessor
+// register available.
 
 //----------
@@ -196,41 +247,4 @@
 	return val;
 }
-
-// //----------
-// // Write data to the TLS block
-// // sadly it looses the type information and can only write 1 word at a time
-// // use with __builtin_offsetof
-// void __cfatls_set(uintptr_t offset, void * value) __attribute__((__noinline__));
-// void __cfatls_set(uintptr_t offset, void * value) {
-//     // create a assembler label before
-//     // marked as clobber all to avoid movement
-//     asm volatile("__cfaasm_set_before:":::"memory");
-
-//     // access tls as normal (except for type information)
-//     *(void**)(offset + (uintptr_t)&my_tls) = value;
-
-//     // create a assembler label after
-//     // marked as clobber all to avoid movement
-//     asm volatile("__cfaasm_set_after:":::"memory");
-// }
-
-// //----------
-// #include <stdio.h>
-// int main() {
-//     // Get the information
-//     // Must use inline assembly to get access to label
-//     // C is annoying here because this could easily be a static const but "initializer element is not a compile-time constant"
-//     // The big advantage of this approach is that there is 0 overhead for the read and writes function
-//     void * __cfaasm_addr_get_before = ({ void * value; asm("movq $__cfaasm_get_before, %[v]\n\t" : [v]"=r"(value) ); value; });
-//     void * __cfaasm_addr_get_after  = ({ void * value; asm("movq $__cfaasm_get_after , %[v]\n\t" : [v]"=r"(value) ); value; });
-//     void * __cfaasm_addr_set_before = ({ void * value; asm("movq $__cfaasm_set_before, %[v]\n\t" : [v]"=r"(value) ); value; });
-//     void * __cfaasm_addr_set_after  = ({ void * value; asm("movq $__cfaasm_set_after , %[v]\n\t" : [v]"=r"(value) ); value; });
-
-//     printf("%p to %p\n", __cfaasm_addr_get_before, __cfaasm_addr_get_after);
-//     printf("%p to %p\n", __cfaasm_addr_set_before, __cfaasm_addr_set_after);
-//     return 0;
-// }
-
-__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
 
 extern "C" {
@@ -494,4 +508,6 @@
 #endif
 
+__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
+
 // Context switch signal handler
 // Receives SIGUSR1 signal and causes the current thread to yield
