Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 3959595fcbab91d221eb882dc5976f75a47b5067)
+++ libcfa/src/concurrency/preemption.cfa	(revision 231b18f299ecfc1eaab7aaa615d70064e3240d08)
@@ -10,6 +10,6 @@
 // Created On       : Mon Jun 5 14:20:42 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Aug 26 16:46:03 2020
-// Update Count     : 53
+// Last Modified On : Fri Nov  6 07:42:13 2020
+// Update Count     : 54
 //
 
@@ -163,4 +163,56 @@
 // Kernel Signal Tools
 //=============================================================================================
+
+// In a user-level threading system, there are a handful of thread-local variables where this problem occurs on the ARM.
+// 
+// For each kernel thread running user-level threads, there is a flag variable to indicate if interrupts are
+// enabled/disabled for that kernel thread. Therefore, this variable is made thread local.
+// 
+// For example, this code fragment sets the state of the "interrupts" variable in thread-local memory.
+// 
+// _Thread_local volatile int interrupts;
+// int main() {
+//     interrupts = 0; /* disable interrupts */ }
+// 
+// which generates the following code on the ARM
+// 
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000000610 <+0>:	mrs	x1, tpidr_el0
+//    0x0000000000000614 <+4>:	mov	w0, #0x0                   	// #0
+//    0x0000000000000618 <+8>:	add	x1, x1, #0x0, lsl #12
+//    0x000000000000061c <+12>:	add	x1, x1, #0x10
+//    0x0000000000000620 <+16>:	str	wzr, [x1]
+//    0x0000000000000624 <+20>:	ret
+// 
+// The mrs moves a pointer from coprocessor register tpidr_el0 into register x1.  Register w0 is set to 0. The two adds
+// increase the TLS pointer with the displacement (offset) 0x10, which is the location in the TLS of variable
+// "interrupts".  Finally, 0 is stored into "interrupts" through the pointer in register x1 that points into the
+// TLS. Now once x1 has the pointer to the location of the TLS for kernel thread N, the user thread can be preempted
+// at the user level and put on the user-level ready-queue. When the preempted thread gets to the front of
+// the user-level ready-queue it is run on kernel thread M. It now stores 0 into "interrupts" back on kernel thread N,
+// turning off interrupts on the wrong kernel thread.
+// 
+// On the x86, the following code is generated for the same code fragment.
+// 
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000400420 <+0>:	movl   $0x0,%fs:0xfffffffffffffffc
+//    0x000000000040042c <+12>:	xor    %eax,%eax
+//    0x000000000040042e <+14>:	retq   
+// 
+// Here base-displacement addressing is used to atomically reset variable "interrupts" relative to the TLS pointer in
+// register "fs".
+// 
+// Hence, the ARM has base-displacement addressing for the general-purpose registers, BUT not for the coprocessor
+// registers. As a result, generating the address for the write into variable "interrupts" is no longer atomic.
+// 
+// Note this problem does NOT occur when just using multiple kernel threads because the preemption ALWAYS restarts the
+// thread on the same kernel thread.
+// 
+// The obvious question is why does ARM use a coprocessor register to store the TLS pointer given that coprocessor
+// registers are second-class registers with respect to the instruction set. One possible answer is that they did not
+// want to dedicate one of the general registers to hold the TLS pointer and there was a free coprocessor register
+// available.
 
 __cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
