#include <atomic>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <thread>

#include <x86intrin.h>

struct __attribute__((aligned(128))) Global_t {
	volatile size_t value;
} global;

static const size_t iterations = 1'000'000'000;

size_t read() {
	// size_t r = __atomic_load_n(&global.value, __ATOMIC_RELAXED);
	// _mm_stream_si64((long long int*)&global.value, r);
	// // _mm_clflush( (void*)&global.value );
	// // __builtin_prefetch((void*)&global.value);
	// asm volatile(
	// 	"PREFETCHNTA %[target]"
	// 	:
	// 	: [target] "m" (global.value)
	// );
	// return r;
	return __atomic_load_n(&global.value, __ATOMIC_SEQ_CST);

	// __m128i r = _mm_stream_load_si128((__m128i*)&global.value);
	// asm volatile(
	// 	"PREFETCHNTA %[target]"
	// 	:
	// 	: [target] "m" (global.value)
	// );
	// return ((Global_t*)&r)->value;
	// size_t r;
	// asm volatile(
	// 	"MOVNTI %[target], %[r]\n\t"
	// 	: [r] "=r" (r)
	// 	: [target] "m" (global.value)
	// );
	// return r;
}

void write(size_t v) {
	// __atomic_store_n(&global.value, v, __ATOMIC_SEQ_CST);
	// __atomic_store_n(&global.value, v, __ATOMIC_RELAXED);
	// asm volatile(
	// 	"MOVNTI %[v], %[target]\n\t"
	// 	:
	// 	: [target] "m" (global.value), [v] "r" (v)
	// );
	_mm_stream_si64((long long int*)&global.value, v);
}

void reader(size_t * reads, size_t * diffs, size_t * m) {
	size_t last = read();
	for(size_t i = 0; i < iterations; i++) {
		size_t val = read();
		if(last != val) (*diffs)++;
		last = val;
		if(last > *m) *m = last;
		(*reads)++;
	}
}

std::atomic<bool> done = { false };

void writer() {
	size_t v = 0;
	while(!done) {
		v++;
		write(v);
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
}

int main() {
	std::cout.imbue(std::locale(""));
	size_t reads = 0;
	size_t diffs = 0;
	size_t max   = 0;
	auto w = std::thread(writer);
	auto r = std::thread(reader, &reads, &diffs, &max);
	r.join();
	done = true;
	w.join();
	std::cout << reads << " " << diffs << " " << max << std::endl;
}