Index: libcfa/src/concurrency/channel.hfa
===================================================================
--- libcfa/src/concurrency/channel.hfa	(revision efdd18cb86b35dfd17a92e94494ea3c0fc7c6f8d)
+++ libcfa/src/concurrency/channel.hfa	(revision a45e21c53f79653fd83bc2f16041261ec743736a)
@@ -3,130 +3,5 @@
 #include <locks.hfa>
 #include <list.hfa>
-
-#define __COOP_CHANNEL
-#ifdef __PREVENTION_CHANNEL
-forall( T ) {
-struct channel {
-    size_t size, count, front, back;
-    T * buffer;
-    thread$ * chair;
-    T * chair_elem;
-    exp_backoff_then_block_lock c_lock, p_lock;
-    __spinlock_t mutex_lock;
-    char __padding[64]; // avoid false sharing in arrays of channels
-};
-
-static inline void ?{}( channel(T) &c, size_t _size ) with(c) {
-    size = _size;
-    front = back = count = 0;
-    buffer = aalloc( size );
-    chair = 0p;
-    mutex_lock{};
-    c_lock{};
-    p_lock{};
-}
-
-static inline void ?{}( channel(T) &c ){ ((channel(T) &)c){ 0 }; }
-static inline void ^?{}( channel(T) &c ) with(c) { delete( buffer ); }
-static inline size_t get_count( channel(T) & chan ) with(chan) { return count; }
-static inline size_t get_size( channel(T) & chan ) with(chan) { return size; }
-static inline bool has_waiters( channel(T) & chan ) with(chan) { return chair != 0p; }
-
-static inline void insert_( channel(T) & chan, T & elem ) with(chan) {
-    memcpy((void *)&buffer[back], (void *)&elem, sizeof(T));
-    count += 1;
-    back++;
-    if ( back == size ) back = 0;
-}
-
-static inline void insert( channel(T) & chan, T elem ) with( chan ) {
-    lock( p_lock );
-    lock( mutex_lock __cfaabi_dbg_ctx2 );
-
-    // have to check for the zero size channel case
-    if ( size == 0 && chair != 0p ) {
-        memcpy((void *)chair_elem, (void *)&elem, sizeof(T));
-        unpark( chair );
-        chair = 0p;
-        unlock( mutex_lock );
-        unlock( p_lock );
-        unlock( c_lock );
-        return;
-    }
-
-    // wait if buffer is full, work will be completed by someone else
-    if ( count == size ) {
-        chair = active_thread();
-        chair_elem = &elem;
-        unlock( mutex_lock );
-        park( );
-        return;
-    } // if
-
-    if ( chair != 0p ) {
-        memcpy((void *)chair_elem, (void *)&elem, sizeof(T));
-        unpark( chair );
-        chair = 0p;
-        unlock( mutex_lock );
-        unlock( p_lock );
-        unlock( c_lock );
-        return;
-    }
-    insert_( chan, elem );
-
-    unlock( mutex_lock );
-    unlock( p_lock );
-}
-
-static inline T remove( channel(T) & chan ) with(chan) {
-    lock( c_lock );
-    lock( mutex_lock __cfaabi_dbg_ctx2 );
-    T retval;
-
-    // have to check for the zero size channel case
-    if ( size == 0 && chair != 0p ) {
-        memcpy((void *)&retval, (void *)chair_elem, sizeof(T));
-        unpark( chair );
-        chair = 0p;
-        unlock( mutex_lock );
-        unlock( p_lock );
-        unlock( c_lock );
-        return retval;
-    }
-
-    // wait if buffer is empty, work will be completed by someone else
-    if ( count == 0 ) {
-        chair = active_thread();
-        chair_elem = &retval;
-        unlock( mutex_lock );
-        park( );
-        return retval;
-    }
-
-    // Remove from buffer
-    memcpy((void *)&retval, (void *)&buffer[front], sizeof(T));
-    count -= 1;
-    front++;
-    if ( front == size ) front = 0;
-
-    if ( chair != 0p ) {
-        insert_( chan, *chair_elem );  // do waiting producer work
-        unpark( chair );
-        chair = 0p;
-        unlock( mutex_lock );
-        unlock( p_lock );
-        unlock( c_lock );
-        return retval;
-    }
-
-    unlock( mutex_lock );
-    unlock( c_lock );
-    return retval;
-}
-
-} // forall( T )
-#endif
-
-#ifdef __COOP_CHANNEL
+#include <mutex_stmt.hfa>
 
 // link field used for threads waiting on channel
@@ -148,12 +23,46 @@
 }
 
+// wake one thread from the list
+static inline void wake_one( dlist( wait_link ) & queue ) {
+    wait_link & popped = try_pop_front( queue );
+    unpark( popped.t );
+}
+
+// returns true if woken due to shutdown
+// blocks thread on list and releases passed lock
+static inline bool block( dlist( wait_link ) & queue, void * elem_ptr, go_mutex & lock ) {
+    wait_link w{ active_thread(), elem_ptr };
+    insert_last( queue, w );
+    unlock( lock );
+    park();
+    return w.elem == 0p;
+}
+
+// void * used for some fields since exceptions don't work with parametric polymorphism currently
+exception channel_closed {
+    // on failed insert elem is a ptr to the element attempting to be inserted
+    // on failed remove elem ptr is 0p
+    // on resumption of a failed insert this elem will be inserted
+    // so a user may modify it in the resumption handler
+    void * elem;
+
+    // pointer to chan that is closed
+    void * closed_chan;
+};
+vtable(channel_closed) channel_closed_vt;
+
+// #define CHAN_STATS // define this to get channel stats printed in dtor
+
 forall( T ) {
 
-struct channel {
-    size_t size;
-    size_t front, back, count;
+struct __attribute__((aligned(128))) channel {
+    size_t size, front, back, count;
     T * buffer;
-    dlist( wait_link ) prods, cons;
-    exp_backoff_then_block_lock mutex_lock;
+    dlist( wait_link ) prods, cons; // lists of blocked threads
+    go_mutex mutex_lock;            // MX lock
+    bool closed;                    // indicates channel close/open
+    #ifdef CHAN_STATS
+    size_t blocks, operations;      // counts total ops and ops resulting in a blocked thd
+    #endif
 };
 
@@ -165,8 +74,19 @@
     cons{};
     mutex_lock{};
+    closed = false;
+    #ifdef CHAN_STATS
+    blocks = 0;
+    operations = 0;
+    #endif
 }
 
 static inline void ?{}( channel(T) &c ){ ((channel(T) &)c){ 0 }; }
-static inline void ^?{}( channel(T) &c ) with(c) { delete( buffer ); }
+static inline void ^?{}( channel(T) &c ) with(c) {
+    #ifdef CHAN_STATS
+    printf("Channel %p Blocks: %lu, Operations: %lu, %.2f%% of ops blocked\n", &c, blocks, operations, ((double)blocks)/operations * 100);
+    #endif
+    verifyf( cons`isEmpty && prods`isEmpty, "Attempted to delete channel with waiting threads (Deadlock).\n" );
+    delete( buffer );
+}
 static inline size_t get_count( channel(T) & chan ) with(chan) { return count; }
 static inline size_t get_size( channel(T) & chan ) with(chan) { return size; }
@@ -175,5 +95,34 @@
 static inline bool has_waiting_producers( channel(T) & chan ) with(chan) { return !prods`isEmpty; }
 
-static inline void insert_( channel(T) & chan, T & elem ) with(chan) {
+// closes the channel and notifies all blocked threads
+static inline void close( channel(T) & chan ) with(chan) {
+    lock( mutex_lock );
+    closed = true;
+
+    // flush waiting consumers and producers
+    while ( has_waiting_consumers( chan ) ) {
+        cons`first.elem = 0p;
+        wake_one( cons );
+    }
+    while ( has_waiting_producers( chan ) ) {
+        prods`first.elem = 0p;
+        wake_one( prods );
+    }
+    unlock(mutex_lock);
+}
+
+static inline void is_closed( channel(T) & chan ) with(chan) { return closed; }
+
+static inline void flush( channel(T) & chan, T elem ) with(chan) {
+    lock( mutex_lock );
+    while ( count == 0 && !cons`isEmpty ) {
+        memcpy(cons`first.elem, (void *)&elem, sizeof(T)); // do waiting consumer work
+        wake_one( cons );
+    }
+    unlock( mutex_lock );
+}
+
+// handles buffer insert
+static inline void __buf_insert( channel(T) & chan, T & elem ) with(chan) {
     memcpy((void *)&buffer[back], (void *)&elem, sizeof(T));
     count += 1;
@@ -182,18 +131,54 @@
 }
 
-static inline void wake_one( dlist( wait_link ) & queue ) {
-    wait_link & popped = try_pop_front( queue );
-    unpark( popped.t );
-}
-
-static inline void block( dlist( wait_link ) & queue, void * elem_ptr, exp_backoff_then_block_lock & lock ) {
-    wait_link w{ active_thread(), elem_ptr };
-    insert_last( queue, w );
-    unlock( lock );
-    park();
+// does the buffer insert or hands elem directly to consumer if one is waiting
+static inline void __do_insert( channel(T) & chan, T & elem ) with(chan) {
+    if ( count == 0 && !cons`isEmpty ) {
+        memcpy(cons`first.elem, (void *)&elem, sizeof(T)); // do waiting consumer work
+        wake_one( cons );
+    } else __buf_insert( chan, elem );
+}
+
+// needed to avoid an extra copy in closed case
+static inline bool __internal_try_insert( channel(T) & chan, T & elem ) with(chan) {
+    lock( mutex_lock );
+    #ifdef CHAN_STATS
+    operations++;
+    #endif
+    if ( count == size ) { unlock( mutex_lock ); return false; }
+    __do_insert( chan, elem );
+    unlock( mutex_lock );
+    return true;
+}
+
+// attempts a nonblocking insert
+// returns true if insert was successful, false otherwise
+static inline bool try_insert( channel(T) & chan, T elem ) { return __internal_try_insert( chan, elem ); }
+
+// handles closed case of insert routine
+static inline void __closed_insert( channel(T) & chan, T & elem ) with(chan) {
+    channel_closed except{&channel_closed_vt, &elem, &chan };
+    throwResume except; // throw closed resumption
+    if ( !__internal_try_insert( chan, elem ) ) throw except; // if try to insert fails (would block), throw termination
 }
 
 static inline void insert( channel(T) & chan, T elem ) with(chan) {
-    lock( mutex_lock );
+    // check for close before acquire mx
+    if ( unlikely(closed) ) {
+        __closed_insert( chan, elem );
+        return;
+    } 
+
+    lock( mutex_lock );
+
+    #ifdef CHAN_STATS
+    if ( !closed ) operations++;
+    #endif
+
+    // if closed handle
+    if ( unlikely(closed) ) {
+        unlock( mutex_lock );
+        __closed_insert( chan, elem );
+        return;
+    }
 
     // have to check for the zero size channel case
@@ -202,10 +187,16 @@
         wake_one( cons );
         unlock( mutex_lock );
-        return;
+        return true;
     }
 
     // wait if buffer is full, work will be completed by someone else
     if ( count == size ) {
-        block( prods, &elem, mutex_lock );
+        #ifdef CHAN_STATS
+        blocks++;
+        #endif
+
+        // check for if woken due to close
+        if ( unlikely( block( prods, &elem, mutex_lock ) ) )
+            __closed_insert( chan, elem );
         return;
     } // if
@@ -214,12 +205,76 @@
         memcpy(cons`first.elem, (void *)&elem, sizeof(T)); // do waiting consumer work
         wake_one( cons );
-    } else insert_( chan, elem );
+    } else __buf_insert( chan, elem );
     
     unlock( mutex_lock );
+    return;
+}
+
+// handles buffer remove
+static inline void __buf_remove( channel(T) & chan, T & retval ) with(chan) {
+    memcpy((void *)&retval, (void *)&buffer[front], sizeof(T));
+    count -= 1;
+    front = (front + 1) % size;
+}
+
+// does the buffer remove and potentially does waiting producer work
+static inline void __do_remove( channel(T) & chan, T & retval ) with(chan) {
+    __buf_remove( chan, retval );
+    if (count == size - 1 && !prods`isEmpty ) {
+        __buf_insert( chan, *(T *)prods`first.elem );  // do waiting producer work
+        wake_one( prods );
+    }
+}
+
+// needed to avoid an extra copy in closed case and single return val case
+static inline bool __internal_try_remove( channel(T) & chan, T & retval ) with(chan) {
+    lock( mutex_lock );
+    #ifdef CHAN_STATS
+    operations++;
+    #endif
+    if ( count == 0 ) { unlock( mutex_lock ); return false; }
+    __do_remove( chan, retval );
+    unlock( mutex_lock );
+    return true;
+}
+
+// attempts a nonblocking remove
+// returns [T, true] if insert was successful
+// returns [T, false] if insert was successful (T uninit)
+static inline [T, bool] try_remove( channel(T) & chan ) {
+    T retval;
+    return [ retval, __internal_try_remove( chan, retval ) ];
+}
+
+static inline T try_remove( channel(T) & chan, T elem ) {
+    T retval;
+    __internal_try_remove( chan, retval );
+    return retval;
+}
+
+// handles closed case of insert routine
+static inline void __closed_remove( channel(T) & chan, T & retval ) with(chan) {
+    channel_closed except{&channel_closed_vt, 0p, &chan };
+    throwResume except; // throw resumption
+    if ( !__internal_try_remove( chan, retval ) ) throw except; // if try to remove fails (would block), throw termination
 }
 
 static inline T remove( channel(T) & chan ) with(chan) {
-    lock( mutex_lock );
     T retval;
+    if ( unlikely(closed) ) {
+        __closed_remove( chan, retval );
+        return retval;
+    } 
+    lock( mutex_lock );
+
+    #ifdef CHAN_STATS
+    if ( !closed ) operations++;
+    #endif
+
+    if ( unlikely(closed) ) {
+        unlock( mutex_lock );
+        __closed_remove( chan, retval );
+        return retval;
+    } 
 
     // have to check for the zero size channel case
@@ -233,17 +288,15 @@
     // wait if buffer is empty, work will be completed by someone else
     if (count == 0) {
-        block( cons, &retval, mutex_lock );
+        #ifdef CHAN_STATS
+        blocks++;
+        #endif
+        // check for if woken due to close
+        if ( unlikely( block( cons, &retval, mutex_lock ) ) )
+            __closed_remove( chan, retval );
         return retval;
     }
 
     // Remove from buffer
-    memcpy((void *)&retval, (void *)&buffer[front], sizeof(T));
-    count -= 1;
-    front = (front + 1) % size;
-
-    if (count == size - 1 && !prods`isEmpty ) {
-        insert_( chan, *(T *)prods`first.elem );  // do waiting producer work
-        wake_one( prods );
-    }
+    __do_remove( chan, retval );
 
     unlock( mutex_lock );
@@ -251,149 +304,2 @@
 }
 } // forall( T )
-#endif
-
-#ifdef __BARGE_CHANNEL
-forall( T ) {
-struct channel {
-    size_t size;
-    size_t front, back, count;
-    T * buffer;
-    fast_cond_var( exp_backoff_then_block_lock ) prods, cons;
-    exp_backoff_then_block_lock mutex_lock;
-};
-
-static inline void ?{}( channel(T) &c, size_t _size ) with(c) {
-    size = _size;
-    front = back = count = 0;
-    buffer = aalloc( size );
-    prods{};
-    cons{};
-    mutex_lock{};
-}
-
-static inline void ?{}( channel(T) &c ){ ((channel(T) &)c){ 0 }; }
-static inline void ^?{}( channel(T) &c ) with(c) { delete( buffer ); }
-static inline size_t get_count( channel(T) & chan ) with(chan) { return count; }
-static inline size_t get_size( channel(T) & chan ) with(chan) { return size; }
-static inline bool has_waiters( channel(T) & chan ) with(chan) { return !empty( cons ) || !empty( prods ); }
-static inline bool has_waiting_consumers( channel(T) & chan ) with(chan) { return !empty( cons ); }
-static inline bool has_waiting_producers( channel(T) & chan ) with(chan) { return !empty( prods ); }
-
-static inline void insert_( channel(T) & chan, T & elem ) with(chan) {
-    memcpy((void *)&buffer[back], (void *)&elem, sizeof(T));
-    count += 1;
-    back++;
-    if ( back == size ) back = 0;
-}
-
-
-static inline void insert( channel(T) & chan, T elem ) with(chan) {
-    lock( mutex_lock );
-
-    while ( count == size ) { 
-        wait( prods, mutex_lock );
-    } // if
-
-    insert_( chan, elem );
-    
-    if ( !notify_one( cons ) && count < size )
-        notify_one( prods );
-
-    unlock( mutex_lock );
-}
-
-static inline T remove( channel(T) & chan ) with(chan) {
-    lock( mutex_lock );
-    T retval;
-
-    while (count == 0) { 
-        wait( cons, mutex_lock );
-    }
-
-    memcpy((void *)&retval, (void *)&buffer[front], sizeof(T));
-    count -= 1;
-    front = (front + 1) % size;
-
-    if ( !notify_one( prods ) && count > 0 )
-        notify_one( cons );
-
-    unlock( mutex_lock );
-    return retval;
-}
-
-} // forall( T )
-#endif
-
-#ifdef __NO_WAIT_CHANNEL
-forall( T ) {
-struct channel {
-    size_t size;
-    size_t front, back, count;
-    T * buffer;
-    thread$ * chair;
-    T * chair_elem;
-    exp_backoff_then_block_lock c_lock, p_lock;
-    __spinlock_t mutex_lock;
-};
-
-static inline void ?{}( channel(T) &c, size_t _size ) with(c) {
-    size = _size;
-    front = back = count = 0;
-    buffer = aalloc( size );
-    chair = 0p;
-    mutex_lock{};
-    c_lock{};
-    p_lock{};
-    lock( c_lock );
-}
-
-static inline void ?{}( channel(T) &c ){ ((channel(T) &)c){ 0 }; }
-static inline void ^?{}( channel(T) &c ) with(c) { delete( buffer ); }
-static inline size_t get_count( channel(T) & chan ) with(chan) { return count; }
-static inline size_t get_size( channel(T) & chan ) with(chan) { return size; }
-static inline bool has_waiters( channel(T) & chan ) with(chan) { return c_lock.lock_value != 0; }
-
-static inline void insert_( channel(T) & chan, T & elem ) with(chan) {
-    memcpy((void *)&buffer[back], (void *)&elem, sizeof(T));
-    count += 1;
-    back++;
-    if ( back == size ) back = 0;
-}
-
-static inline void insert( channel(T) & chan, T elem ) with( chan ) {
-    lock( p_lock );
-    lock( mutex_lock __cfaabi_dbg_ctx2 );
-
-    insert_( chan, elem );
-
-    if ( count != size )
-        unlock( p_lock );
-
-    if ( count == 1 )
-        unlock( c_lock );
-        
-    unlock( mutex_lock );
-}
-
-static inline T remove( channel(T) & chan ) with(chan) {
-    lock( c_lock );
-    lock( mutex_lock __cfaabi_dbg_ctx2 );
-    T retval;
-
-    // Remove from buffer
-    memcpy((void *)&retval, (void *)&buffer[front], sizeof(T));
-    count -= 1;
-    front = (front + 1) % size;
-
-    if ( count != 0 )
-        unlock( c_lock );
-
-    if ( count == size - 1 )
-        unlock( p_lock );
-        
-    unlock( mutex_lock );
-    return retval;
-}
-
-} // forall( T )
-#endif
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision efdd18cb86b35dfd17a92e94494ea3c0fc7c6f8d)
+++ libcfa/src/concurrency/locks.hfa	(revision a45e21c53f79653fd83bc2f16041261ec743736a)
@@ -32,5 +32,4 @@
 #include <fstream.hfa>
 
-
 // futex headers
 #include <linux/futex.h>      /* Definition of FUTEX_* constants */
@@ -155,5 +154,4 @@
 // futex_mutex
 
-// - No cond var support
 // - Kernel thd blocking alternative to the spinlock
 // - No ownership (will deadlock on reacq)
@@ -185,16 +183,14 @@
 	int state;
 
-	
-	// // linear backoff omitted for now
-	// for( int spin = 4; spin < 1024; spin += spin) {
-	// 	state = 0;
-	// 	// if unlocked, lock and return
-	// 	if (internal_try_lock(this, state)) return;
-	// 	if (2 == state) break;
-	// 	for (int i = 0; i < spin; i++) Pause();
-	// }
-
-	// no contention try to acquire
-	if (internal_try_lock(this, state)) return;
+	for( int spin = 4; spin < 1024; spin += spin) {
+		state = 0;
+		// if unlocked, lock and return
+		if (internal_try_lock(this, state)) return;
+		if (2 == state) break;
+		for (int i = 0; i < spin; i++) Pause();
+	}
+
+	// // no contention try to acquire
+	// if (internal_try_lock(this, state)) return;
 	
 	// if not in contended state, set to be in contended state
@@ -209,9 +205,8 @@
 
 static inline void unlock(futex_mutex & this) with(this) {
-	// if uncontended do atomice unlock and then return
-	if (__atomic_fetch_sub(&val, 1, __ATOMIC_RELEASE) == 1) return; // TODO: try acq/rel
+	// if uncontended do atomic unlock and then return
+    if (__atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1) return;
 	
 	// otherwise threads are blocked so we must wake one
-	__atomic_store_n((int *)&val, 0, __ATOMIC_RELEASE);
 	futex((int *)&val, FUTEX_WAKE, 1);
 }
@@ -222,4 +217,74 @@
 // to set recursion count after getting signalled;
 static inline void on_wakeup( futex_mutex & f, size_t recursion ) {}
+
+//-----------------------------------------------------------------------------
+// go_mutex
+
+// - Kernel thd blocking alternative to the spinlock
+// - No ownership (will deadlock on reacq)
+// - Golang's flavour of mutex
+// - Impl taken from Golang: src/runtime/lock_futex.go
+struct go_mutex {
+	// lock state any state other than UNLOCKED is locked
+	// enum LockState { UNLOCKED = 0, LOCKED = 1, SLEEPING = 2 };
+	
+	// stores a lock state
+	int val; 
+};
+
+static inline void  ?{}( go_mutex & this ) with(this) { val = 0; }
+
+static inline bool internal_try_lock(go_mutex & this, int & compare_val, int new_val ) with(this) {
+	return __atomic_compare_exchange_n((int*)&val, (int*)&compare_val, new_val, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
+}
+
+static inline int internal_exchange(go_mutex & this, int swap ) with(this) {
+	return __atomic_exchange_n((int*)&val, swap, __ATOMIC_ACQUIRE);
+}
+
+const int __go_mtx_spins = 4;
+const int __go_mtx_pauses = 30;
+// if this is called recursively IT WILL DEADLOCK!!!!!
+static inline void lock(go_mutex & this) with(this) {
+	int state, init_state;
+
+    // speculative grab
+    state = internal_exchange(this, 1);
+    if ( !state ) return; // state == 0
+    init_state = state;
+    for (;;) {
+        for( int i = 0; i < __go_mtx_spins; i++ ) {
+            while( !val ) { // lock unlocked
+                state = 0;
+                if (internal_try_lock(this, state, init_state)) return;
+            }
+            for (int i = 0; i < __go_mtx_pauses; i++) Pause();
+        }
+
+        while( !val ) { // lock unlocked
+            state = 0;
+            if (internal_try_lock(this, state, init_state)) return;
+        }
+        sched_yield();
+        
+        // if not in contended state, set to be in contended state
+        state = internal_exchange(this, 2);
+        if ( !state ) return; // state == 0
+        init_state = 2;
+        futex((int*)&val, FUTEX_WAIT, 2); // if val is not 2 this returns with EWOULDBLOCK
+    }
+}
+
+static inline void unlock( go_mutex & this ) with(this) {
+	// if uncontended do atomic unlock and then return
+    if (__atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1) return;
+	
+	// otherwise threads are blocked so we must wake one
+	futex((int *)&val, FUTEX_WAKE, 1);
+}
+
+static inline void on_notify( go_mutex & f, thread$ * t){ unpark(t); }
+static inline size_t on_wait( go_mutex & f ) {unlock(f); return 0;}
+static inline void on_wakeup( go_mutex & f, size_t recursion ) {}
 
 //-----------------------------------------------------------------------------
@@ -271,4 +336,6 @@
 	this.lock_value = 0;
 }
+
+static inline void  ^?{}( exp_backoff_then_block_lock & this ){}
 
 static inline bool internal_try_lock(exp_backoff_then_block_lock & this, size_t & compare_val) with(this) {
