Index: libcfa/src/concurrency/CtxSwitch-arm.S
===================================================================
--- libcfa/src/concurrency/CtxSwitch-arm.S	(revision 636d115b7b50892534cae9061eb31c837d3678a8)
+++ libcfa/src/concurrency/CtxSwitch-arm.S	(revision c5cbc099ac7827d984d9838a8e0b99386d37cc8e)
@@ -1,62 +1,104 @@
-	@ 32 bit ARM context switch
-	@ This function assumes that r9 has no special meaning on the platform it's
-	@ being built on.
-	@ If r9 is special, uncomment the following line and it will be left alone
+// 
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+// 
+// CtxSwitch-arm.S -- AArch64 context switch and invoke stub for coroutines/tasks
+// 
+// Author           : Peter A. Buhr
+// Created On       : Sun Aug 16 07:50:13 2020
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Thu Aug 20 18:43:51 2020
+// Update Count     : 24
+// 
 
-	@ #define R9_SPECIAL
+// The context switch routine requires the initial stack of a thread to
+// look as if the thread had saved its context in the normal manner.
 
-	#define PTR_BYTE        4
-	#define SP_OFFSET       ( 0 * PTR_BYTE )
-	#define FP_OFFSET       ( 1 * PTR_BYTE )
-	#define PC_OFFSET       ( 2 * PTR_BYTE )
+// Offsets must be synchronized with the __stack_context_t in invoke.h.
+// PTR_BYTE is the pointer size in bytes; SP_OFFSET and FP_OFFSET are the byte offsets of the saved sp and fp.
+#define PTR_BYTE	8
+#define SP_OFFSET	( 0 * PTR_BYTE )
+#define FP_OFFSET	( 1 * PTR_BYTE )
+
+// Context switch between coroutines/tasks.
+//   void __cfactx_switch( struct __stack_context_t * from, struct __stack_context_t * to ) ;
+// Arguments "from" in register x0, "to" in register x1.
+// SAVE is the size in bytes of the on-stack save area: 12 GP registers (x19-x30) + 8 FP registers (d8-d15), 8 bytes each.
+#define SAVE		20 * 8
+
+	.file "CtxSwitch-arm.S"
+	.text
+	.align 2
+	.global __cfactx_switch
+	.type __cfactx_switch, @function
+__cfactx_switch:
+
+	sub  sp, sp, #SAVE					// reserve SAVE bytes on the current stack for the register save area
+
+	// Save callee-saved (non-volatile) GP registers x19-x28, plus fp (x29) and lr (x30), on the stack.
+
+	stp  x19, x20, [sp, #0x00]
+	stp  x21, x22, [sp, #0x10]
+	stp  x23, x24, [sp, #0x20]
+	stp  x25, x26, [sp, #0x30]
+	stp  x27, x28, [sp, #0x40]
+	stp  x29, x30, [sp, #0x50]			// x29 => fp, x30 => lr (return address of the suspended context)
+
+	// Save callee-saved (non-volatile) SIMD/FPU registers d8-d15 on the stack.
+
+	stp  d8,  d9,  [sp, #0x60]
+	stp  d10, d11, [sp, #0x70]
+	stp  d12, d13, [sp, #0x80]
+	stp  d14, d15, [sp, #0x90]
+
+	// Save old context (sp and fp) in the "from" area.
+
+	mov  x4, sp							// cannot store sp directly, so copy it through scratch register x4
+	str  x4, [x0, #SP_OFFSET]
+	str  fp, [x0, #FP_OFFSET]
+
+	// Load new context (fp and sp) from the "to" area.
+
+	ldr  fp, [x1, #FP_OFFSET]
+	ldr  x4, [x1, #SP_OFFSET]
+	mov  sp, x4							// cannot load sp directly, so copy it through scratch register x4
+
+	// Load callee-saved GP registers x19-x28, fp (x29) and lr (x30) from the new stack.
+
+	ldp  x19, x20, [sp, #0x00]
+	ldp  x21, x22, [sp, #0x10]
+	ldp  x23, x24, [sp, #0x20]
+	ldp  x25, x26, [sp, #0x30]
+	ldp  x27, x28, [sp, #0x40]
+	ldp  x29, x30, [sp, #0x50]
+	
+	// Load callee-saved SIMD/FPU registers d8-d15 from the new stack.
+
+	ldp  d8,  d9,  [sp, #0x60]
+	ldp  d10, d11, [sp, #0x70]
+	ldp  d12, d13, [sp, #0x80]
+	ldp  d14, d15, [sp, #0x90]
+
+	add  sp, sp, #SAVE					// release the register save area on the new stack
+	ret									// return to new thread through the x30 just restored (mov pc, x30)
+
+	.size __cfactx_switch, .-__cfactx_switch
+	.section .note.GNU-stack,"",%progbits // mark no executable stack needed
+
+// Stub to create new stacks which can be context switched to
+//   void __cfactx_invoke_stub( void );
 
 	.text
-	.align  2
-	.global __cfactx_switch
-	.type   __cfactx_switch, %function
+	.align 2
+	.global __cfactx_invoke_stub
+	.type __cfactx_invoke_stub, @function
+__cfactx_invoke_stub:
+	mov x0, x19							// parameter 0 (x0) = main, carried in x19
+	mov x1, x20							// parameter 1 (x1) = this, carried in x20
+	mov x30, x21						// x30 (lr) = coroutine invoke routine, carried in x21
+	ret									//   and jmp to it (mov pc, x30); lr is the routine itself, so presumably it never returns -- TODO confirm
+	.size __cfactx_invoke_stub, .-__cfactx_invoke_stub
 
-__cfactx_switch:
-	@ save callee-saved registers: r4-r8, r10, r11, r13(sp) (plus r9 depending on platform specification)
-	@ I've seen reference to 31 registers on 64-bit, if this is the case, more need to be saved
-	@ save thread state registers: r14(lr)
-	@ r12(ip) is intra-procedure-call scratch register, does not need saving between function calls
-
-	#ifdef R9_SPECIAL
-	stmfd r13!, {r4-r8,r10,r11,r14}
-	#else
-	stmfd r13!, {r4-r11,r14}
-	#endif // R9_SPECIAL
-
-	@ save floating point registers: s16-s31
-	vstmdb r13!, {s16-s31}
-
-	@ save frame pointer and stack pointer to outgoing datastructure
-	str sp, [r0, #SP_OFFSET]
-	str fp, [r0, #FP_OFFSET]
-
-	@ restore frame pointer and stack pointer from incoming datastructure
-	ldr fp, [r1, #FP_OFFSET]
-	ldr sp, [r1, #SP_OFFSET]
-
-	@ restore floating point registers: s16-s31
-	vldm r13!, {s16-s31}
-	@ restore r14(lr)
-	@ restore 64-bit extra registers?
-	@ restore callee-saved registers: r4-r8, r10, r11, r13
-
-	#ifdef R9_SPECIAL
-	ldmfd r13!, {r4-r8,r10,r11,r15}
-	#else
-	ldmfd r13!, {r4-r11,r14}    @ loading r14 back into r15 returns
-
-	mov r15, r14
-	#endif // R9_SPECIAL
-
-	.text
-	.align  2
-	.global __cfactx_invoke_stub
-	.type   __cfactx_invoke_stub, %function
-
-__cfactx_invoke_stub:
-        ldmfd r13!, {r0-r1}
-	mov r15, r1
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
