Index: benchmark/basic/fetch_add.c
===================================================================
--- benchmark/basic/fetch_add.c	(revision a21dec46515be7f1baa5f03d23f83fe06d3a2425)
+++ benchmark/basic/fetch_add.c	(revision 525b5ef9b1406d81dcf21e9cc5dbdaa16ab2df71)
@@ -2,4 +2,7 @@
 
 #include "bench.h"
+
+// Does a "lock add" on entry and a "lock sub" on exit => 2 atomic instructions, making it the most expensive
+// atomic test.
 
 volatile int value;
Index: benchmark/basic/tls_fetch_add.c
===================================================================
--- benchmark/basic/tls_fetch_add.c	(revision a21dec46515be7f1baa5f03d23f83fe06d3a2425)
+++ benchmark/basic/tls_fetch_add.c	(revision 525b5ef9b1406d81dcf21e9cc5dbdaa16ab2df71)
@@ -3,4 +3,7 @@
 
 #include "bench.h"
+
+// Does not do a fetch & add. It mimics the cfa protocol to disable interrupts locally, by writing true or false to a
+// thread_local Boolean. This means the entire protocol is just two "mov" instructions, making it extremely cheap.
 
 #define thread_local _Thread_local
Index: benchmark/basic/ttst_lock.c
===================================================================
--- benchmark/basic/ttst_lock.c	(revision a21dec46515be7f1baa5f03d23f83fe06d3a2425)
+++ benchmark/basic/ttst_lock.c	(revision 525b5ef9b1406d81dcf21e9cc5dbdaa16ab2df71)
@@ -3,4 +3,7 @@
 
 #include "bench.h"
+
+// Does a "lock xchg" on entry but a simple "mov" on exit => cheaper because there is no contention. While it has
+// much more code, the bulk is never run.
 
 #define CALIGN __attribute__(( aligned (CACHE_ALIGN) ))