Index: src/libcfa/concurrency/CtxSwitch-x86_64.S
===================================================================
--- src/libcfa/concurrency/CtxSwitch-x86_64.S	(revision 8e5724eb6690c6626d637781fe0055e8dde51853)
+++ src/libcfa/concurrency/CtxSwitch-x86_64.S	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
@@ -0,0 +1,90 @@
+//                               -*- Mode: Asm -*- 
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// CtxSwitch-x86_64.S --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Nov 28 12:27:26 2016
+// Last Modified By : Thierry Delisle
+// Last Modified On : Mon Nov 28 12:27:26 2016
+// Update Count     : 0
+//
+// This  library is free  software; you  can redistribute  it and/or  modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software  Foundation; either  version 2.1 of  the License, or  (at your
+// option) any later version.
+// 
+// This library is distributed in the  hope that it will be useful, but WITHOUT
+// ANY  WARRANTY;  without even  the  implied  warranty  of MERCHANTABILITY  or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+// 
+// You should  have received a  copy of the  GNU Lesser General  Public License
+// along  with this library.
+// 
+
+// This context switch routine depends on the fact that the stack of a new
+// thread has been set up to look like the thread has saved its context in
+// the normal manner.
+//
+// void CtxSwitch( machine_context *from, machine_context *to );
+
+// Byte offsets of the SP, FP and PC fields of struct machine_context_t. They are
+// maintained by hand and must be kept in step with the C declaration of that struct.
+
+#define PTR_BYTE	8				// bytes per pointer-sized field
+#define SP_OFFSET	( 0 * PTR_BYTE )
+#define FP_OFFSET	( 1 * PTR_BYTE )
+#define PC_OFFSET	( 2 * PTR_BYTE )	// not referenced by the routines in this file
+
+.text
+	.align 2
+.globl	CtxSwitch
+CtxSwitch:
+	// In: %rdi = "from" context, %rsi = "to" context (first two integer arguments).
+	// Push the callee-saved registers %rbx and %r12-%r15 on the outgoing stack;
+	// %rbp is not pushed, it is stored in the context record below.
+	pushq %r15
+	pushq %r14
+	pushq %r13
+	pushq %r12
+	pushq %rbx
+
+	// Record the outgoing stack pointer and frame pointer in the "from" area.
+
+	movq %rsp,SP_OFFSET(%rdi)
+	movq %rbp,FP_OFFSET(%rdi)
+
+	// Switch stacks: load the stack pointer and frame pointer of the "to" area.
+
+	movq SP_OFFSET(%rsi),%rsp
+	movq FP_OFFSET(%rsi),%rbp
+
+	// Pop the callee-saved registers from the new stack, in reverse push order.
+
+	popq %rbx
+	popq %r12
+	popq %r13
+	popq %r14
+	popq %r15
+
+	// ret pops the return address found on the new stack, so execution continues
+	// at the address saved there (or seeded there, for a stack that has never run).
+	ret
+
+.text
+	.align 2
+.globl	coInvokeStub
+coInvokeStub:				// reached by CtxSwitch's ret on a stack prepared with this address as return address
+	movq %rbx, %rdi			// 1st argument <- %rbx (the vtable pointer seeded in the fake frame)
+	movq %r12, %rsi 		// 2nd argument <- %r12 (the vthis pointer seeded in the fake frame)
+	jmp *%r13				// jump (not call) to the invoke routine whose address was seeded in %r13
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: src/libcfa/concurrency/invoke.c
===================================================================
--- src/libcfa/concurrency/invoke.c	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
+++ src/libcfa/concurrency/invoke.c	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
@@ -0,0 +1,65 @@
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "invoke.h"
+
+struct machine_context_t {		// saved context; field order must match the *_OFFSET constants used by CtxSwitch
+	void *SP;				// stack pointer saved/loaded by CtxSwitch
+	void *FP;				// frame pointer saved/loaded by CtxSwitch
+	void *PC;				// NOTE(review): not read or written by any code visible here
+};
+
+extern void coInvokeStub( void );
+
+// First C routine run by a new coroutine (the invoke routine seeded for coInvokeStub is expected to be this one).
+// Marks the coroutine returned by vtable->this_coroutine(vthis) as Active, then runs vtable->main(vthis).
+void __invokeCoroutine__F_P9scoVtablePv__1(struct coVtable *vtable, void* vthis)
+{
+      fprintf(stderr, "Invoke : Received %p (v %p)\n", vthis, vtable);	// trace on stderr, not on the program's stdout
+
+      struct coroutine* cor = vtable->this_coroutine(vthis);
+
+      cor->state = Active;
+
+      vtable->main(vthis);	// NOTE(review): the return address seeded below this frame is NULL, so main presumably must not return - confirm
+}
+
+void __startCoroutine__A0_1_0___this_coroutine__PFP10scoroutine_Pd0___co_main__PF_Pd0___vtable__PFP9scoVtable_Pd0__F_Pd0PF_P9scoVtablePv___1(
+      struct coroutine *(*this_coroutine)(void * ), 
+      void (*co_main)(void *), 
+      struct coVtable *(*get_vtable)(void *), 
+      void *vthis, 
+      void (*invoke)(struct coVtable *, void *)
+) {
+      // Seed the stack of a never-run coroutine so the first CtxSwitch to it "returns" into coInvokeStub.
+      #if ! defined( __x86_64__ )
+            #error Only __x86_64__ is supported for threads in cfa
+      #endif
+
+      #if defined( __U_SWAPCONTEXT__ )
+            #error __U_SWAPCONTEXT__ should not be defined for __x86_64__
+      #endif
+
+      // Image of what CtxSwitch consumes from a stack: five popped registers, then a return address.
+      struct FakeStack {
+            void *fixedRegisters[5];			// popped into rbx, r12, r13, r14, r15 (in that order)
+            void *rturn;					// return address taken by the ret at the end of CtxSwitch
+            void *dummyReturn;				// NULL return address to provide proper alignment
+      }; // FakeStack
+
+      struct coVtable * const table = get_vtable( vthis );
+      struct coStack_t * const stk = &this_coroutine( vthis )->stack;
+      struct machine_context_t * const ctx = stk->context;
+      struct FakeStack * const frame = (struct FakeStack *)( (char *)stk->base - sizeof( struct FakeStack ) );
+
+      ctx->SP = frame;					// the fake frame ends exactly at the stack base
+      ctx->FP = NULL;					// terminate stack with NULL fp
+      fprintf(stderr, "StartCoroutine : Passing in %p (v %p) to %p\n", vthis, table, invoke);
+      frame->fixedRegisters[0] = table;		// rbx: becomes the 1st argument in coInvokeStub
+      frame->fixedRegisters[1] = vthis;		// r12: becomes the 2nd argument in coInvokeStub
+      frame->fixedRegisters[2] = invoke;		// r13: jump target of coInvokeStub
+      frame->rturn = coInvokeStub;
+      frame->dummyReturn = NULL;
+}
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
+++ src/libcfa/concurrency/invoke.h	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
@@ -0,0 +1,34 @@
+
+#ifndef _INVOKE_H_
+#define _INVOKE_H_
+
+struct coVtable {					// per-object entry points used when a coroutine is first run
+      void (*main)(void*);				// coroutine body; called with the object pointer
+      struct coroutine* (*this_coroutine)(void*);	// maps the object pointer to its coroutine descriptor
+};
+
+struct coStack_t {			// one coroutine stack; the address fields are computed by create_stack
+      unsigned int size;		// usable stack size in bytes
+      void *storage;			// start of the memory block holding the stack
+      void *limit;			// lowest usable address; the stack grows towards it
+      void *base;				// limit + size: upper end of the stack area
+      void *context;			// saved machine_context_t, placed at base
+      void *top;				// end of the region: context + rounded context size
+      bool userStack;			// true when storage was supplied by the caller, false when allocated
+};
+
+
+enum coroutine_state { Start, Inactive, Active, Halt };	// Start: set by the constructor; Inactive/Active: set around context switches; NOTE(review): Halt is never assigned in the visible code
+
+struct coroutine {				// bookkeeping for one coroutine
+      struct coStack_t stack;		// stack and saved machine context
+      const char *name;			// textual name for coroutine/task, printed in diagnostics; set by the constructor
+      int errno_;				// copy of global UNIX variable errno
+      enum coroutine_state state;	      // current execution status for coroutine
+      bool notHalted;			// indicate if execution state is not halted
+
+      struct coroutine *starter;		// first coroutine to resume this one
+      struct coroutine *last;			// last coroutine to resume this one
+};
+
+#endif //_INVOKE_H_
Index: src/libcfa/concurrency/threads
===================================================================
--- src/libcfa/concurrency/threads	(revision 8e5724eb6690c6626d637781fe0055e8dde51853)
+++ src/libcfa/concurrency/threads	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
@@ -1,15 +1,16 @@
+//                              -*- Mode: CFA -*-
 //
-// Cforall Version 1.0.0 Copyright (C) 2015 University of Waterloo
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
 //
-// fstream --
+// threads --
 //
-// Author           : Peter A. Buhr
-// Created On       : Wed May 27 17:56:53 2015
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Apr 28 08:08:04 2016
-// Update Count     : 88
+// Author           : Thierry Delisle
+// Created On       : Mon Nov 28 12:27:26 2016
+// Last Modified By : Thierry Delisle
+// Last Modified On : Mon Nov 28 12:27:26 2016
+// Update Count     : 0
 //
 
@@ -19,9 +20,36 @@
 #include <stdbool.h>
 
-struct coroutine {
-      coroutine* last;
-      const char* name;
-      bool notHalted;
-};
+extern "C" {	// C-linkage types; these repeat the declarations in invoke.h and must stay identical to them
+      struct coVtable {
+            void (*main)(void*);					// coroutine body; called with the object pointer
+            struct coroutine* (*this_coroutine)(void*);	// maps the object pointer to its coroutine descriptor
+      };
+
+      struct coStack_t {
+            unsigned int size;		// usable stack size in bytes
+            void *storage;			// start of the memory block holding the stack
+            void *limit;			// lowest usable address; the stack grows towards it
+            void *base;				// limit + size: upper end of the stack area
+            void *context;			// saved machine_context_t, placed at base
+            void *top;				// end of the region: context + rounded context size
+            bool userStack;			// true when storage was supplied by the caller, false when allocated
+      };
+
+      // Start: set by the constructor; Inactive/Active: set around context switches
+      enum coroutine_state { Start, Inactive, Active, Halt };
+
+      struct coroutine {
+            coStack_t stack;			// stack and saved machine context
+            const char *name;			// textual name for coroutine/task, printed in diagnostics; set by the constructor
+            int errno_;				// copy of global UNIX variable errno
+            coroutine_state state;	      // current execution status for coroutine
+            bool notHalted;			// indicate if execution state is not halted
+
+            coroutine *starter;		// first coroutine to resume this one
+            coroutine *last;			// last coroutine to resume this one
+      };
+}
+
+void ?{}(coStack_t* this);
 
 void ?{}(coroutine* this);
@@ -29,5 +57,10 @@
 trait coroutine_t(dtype T) {
       coroutine* this_coroutine(T* this);
+      void co_main(T* this);			// NOTE(review): presumably the body run by the coroutine - confirm
+      coVtable* vtable(T* this);		// NOTE(review): presumably supplies the coVtable used at start-up - confirm
 };
+
+forall(dtype T | coroutine_t(T))
+void start(T* cor);
 
 void suspend(void);
@@ -37,2 +70,7 @@
 
 #endif //__THREADS_H__
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: src/libcfa/concurrency/threads.c
===================================================================
--- src/libcfa/concurrency/threads.c	(revision 8e5724eb6690c6626d637781fe0055e8dde51853)
+++ src/libcfa/concurrency/threads.c	(revision 78b3f524dae38189ed4f479ebdc4c6f51903a1f1)
@@ -1,14 +1,117 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// threads.c --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Nov 28 12:27:26 2016
+// Last Modified By : Thierry Delisle
+// Last Modified On : Mon Nov 28 12:27:26 2016
+// Update Count     : 0
+//
+
+extern "C" {
+#include <stddef.h>
+#include <malloc.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+}
+
 #include "threads"
 #include "assert"
+#include "libhdr.h"
 
-#include <stddef.h>
+extern "C" { extern void coInvokeStub( void ); }
 
-#include <fstream>
- 
+// minimum feasible stack size in bytes
+#define MinStackSize 1000
+
 static coroutine main_coroutine;
 static coroutine* current_coroutine = &main_coroutine;
 
-void ctxSwitchDirect(void* src, void* dst) {
-	current_coroutine = dst;
+extern "C" {	// same layout as the machine_context_t declared in invoke.c
+	struct machine_context_t {
+		void *SP;	// stack pointer saved/loaded by CtxSwitch
+		void *FP;	// frame pointer saved/loaded by CtxSwitch
+		void *PC;	// NOTE(review): not read or written by any code visible here
+	};
+}
+
+extern "C" { void CtxSwitch( void *from, void *to ) asm ("CtxSwitch"); }// assembler routine that performs the context switch
+
+static size_t pageSize = 0;				// architecture pagesize
+
+void ctxSwitchDirect(coroutine* src, coroutine* dst) {
+	// Suspend src and run dst; control comes back here only when something switches to src again.
+	// TODO: THREAD_GETMEM( This )->disableInterrupts();
+	// mark the coroutine being left as inactive
+	src->state = Inactive;
+
+	// record dst as the coroutine that is now executing
+	current_coroutine = dst;			
+
+	// save src's context and load dst's
+	CtxSwitch( src->stack.context, dst->stack.context );
+	// execution continues here, on src's stack, once src is switched back to
+
+	// src (not dst) is running again, so mark it active
+	src->state = Active;
+
+	// TODO: THREAD_GETMEM( This )->enableInterrupts();
+} //ctxSwitchDirect
+
+void invokeCoroutine(coVtable* vtable, void* this);
+
+forall(dtype T | coroutine_t(T))
+void startCoroutine(T* this, void (*invoke)(coVtable*, void*));
+
+// Used by all constructors: allocates storageSize bytes of stack (or adopts a caller-set this->storage) and fills in size, limit, base, context and top; aborts on failure.
+void create_stack( coStack_t* this, unsigned int storageSize ) {
+	//TEMP HACK do this on proper kernel startup
+	if(pageSize == 0ul) pageSize = sysconf( _SC_PAGESIZE );
+
+	size_t cxtSize = libCeiling( sizeof(machine_context_t), 8 ); // minimum alignment
+
+	if ( (intptr_t)this->storage == 0 ) {
+		this->userStack = false;
+		this->size = libCeiling( storageSize, 16 );
+		// use malloc/memalign because "new" raises an exception for out-of-memory
+		// debug build: page-aligned block with one extra leading page that is made inaccessible below
+		// assume malloc has 8 byte alignment so add 8 to allow rounding up to 16 byte alignment
+		LIB_DEBUG_DO( this->storage = memalign( pageSize, cxtSize + this->size + pageSize ) );
+		LIB_NO_DEBUG_DO( this->storage = malloc( cxtSize + this->size + 8 ) );
+
+		// check the allocation first: calling mprotect on a failed allocation would report the wrong error
+		if ( (intptr_t)this->storage == 0 ) {
+			abortf( "Attempt to allocate %d bytes of storage for coroutine or task execution-state but insufficient memory available.", this->size );
+		} // if
+
+		LIB_DEBUG_DO(
+			if ( mprotect( this->storage, pageSize, PROT_NONE ) == -1 ) {
+				abortf( "(uMachContext &)%p.createContext() : internal error, mprotect failure, error(%d) %s.", this, (int)errno, strerror( (int)errno ) );
+			} // if
+		);
+		LIB_DEBUG_DO( this->limit = (char *)this->storage + pageSize );
+		LIB_NO_DEBUG_DO( this->limit = (char *)libCeiling( (unsigned long)this->storage, 16 ) ); // minimum alignment
+
+	} else {
+		assertf( ((size_t)this->storage & (libAlign() - 1)) == 0ul, "Stack storage %p for task/coroutine must be aligned on %d byte boundary.", this->storage, (int)libAlign() );
+		this->userStack = true;
+		this->size = storageSize - cxtSize;	// the context record is carved out of the caller's block
+
+		if ( this->size % 16 != 0u ) this->size -= 8;
+
+		this->limit = (char *)libCeiling( (unsigned long)this->storage, 16 ); // minimum alignment
+	} // if
+	assertf( this->size >= MinStackSize, "Stack size %d provides less than minimum of %d bytes for a stack.", this->size, MinStackSize );
+	// layout from low to high addresses: limit .. base (stack area), then the context record from base to top
+	this->base = (char *)this->limit + this->size;
+	this->context = this->base;
+	this->top = (char *)this->context + cxtSize;
 }
 
@@ -17,9 +120,28 @@
 }
 
+void ?{}(coStack_t* this) {
+	this->userStack	= false;	// storage is not supplied by the user
+	this->storage	= NULL;	// NULL makes create_stack allocate the memory itself
+	this->top		= NULL;	// this and the next three addresses are computed by create_stack
+	this->context	= NULL;
+	this->base		= NULL;
+	this->limit		= NULL;
+	this->size		= 10240;	// default stack size in bytes
+	create_stack(this, this->size);
+}
+
 void ?{}(coroutine* this)
 {
+	this->name = "Anonymous Coroutine";	// default name; printed by the suspend/resume diagnostics
+	this->errno_ = 0;
+	this->state = Start;			// initial state of every coroutine
+      this->notHalted = true;
+	this->starter = NULL;			// no coroutine has resumed this one yet
 	this->last = NULL;
-      this->name = "A Coroutine";
-      this->notHalted = true;
+}
+
+forall(dtype T | coroutine_t(T))
+void start(T* this) {				// prepares the coroutine of *this for its first run
+	startCoroutine(this, invokeCoroutine);	// NOTE(review): presumably seeds the stack so the first switch enters invokeCoroutine - confirm against invoke.c
 }
 
@@ -27,5 +149,5 @@
       coroutine* src = this_coroutine();		// optimization
 
-	assertf( src->last == (coroutine*)0, 
+	assertf( src->last != 0, 
 		"Attempt to suspend coroutine %.256s (%p) that has never been resumed.\n"
 		"Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
@@ -44,4 +166,5 @@
 	coroutine* dst = this_coroutine(cor);
 
+	fprintf(stderr, "Resuming %p from %p\n", dst, src);
 	if ( src != dst ) {				// not resuming self ?
 		assertf( dst->notHalted , 
@@ -49,6 +172,12 @@
 			"Possible cause is terminated coroutine's main routine has already returned.",
 			src->name, src, dst->name, dst );
+		fprintf(stderr, "Assigning last pointer\n");
 		dst->last = src;					// set last resumer
 	} // if
 	ctxSwitchDirect( src, dst );				// always done for performance testing
 }
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
