#!/bin/bash
###SHELLPACK preamble thotdata 0

# Install script for the thotdata micro-benchmark. The C source is embedded
# at the end of this file and is compiled from $SHELLPACK_TEMP/thotdata.c
# (presumably placed there by the self_extract directive below).

###SHELLPACK parseargBegin
###SHELLPACK parseargEnd

###SHELLPACK self_extract thotdata.c 

# -p: do not complain if a previous install already created the directory.
mkdir -p "$SHELLPACK_SOURCES/thotdata-${VERSION}-installed"

# Libraries must follow the objects that reference them: with a linker that
# resolves symbols left to right (or uses --as-needed), a -lpthread placed
# before the source file can be discarded before any pthread symbol is seen.
gcc -Wall "$SHELLPACK_TEMP/thotdata.c" -o "$SHELLPACK_SOURCES/thotdata-${VERSION}-installed/thotdata" -lpthread || \
	die Failed to build thotdata

echo thotdata installed successfully
exit $SHELLPACK_SUCCESS

==== BEGIN thotdata.c ====
#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>

/*
 * Alignment, in bytes, requested from posix_memalign() for the shared
 * buffer (2 MiB). Presumably chosen to match a huge page size -- TODO
 * confirm against the target platform.
 */
#define HPAGE_SIZE (2*1048576)
/*
 * Number of entries in each worker's private latency buffer. Once this many
 * samples have been recorded the buffer is printed and cleared.
 */
#define REPORT_THRESHOLD 1048576

/*
 * One recorded sample from a worker's pass over a region of the shared
 * buffer. Both fields are nanosecond values derived from CLOCK_REALTIME.
 */
struct access_timing {
	/* Time at which the pass finished, relative to the global 'born'. */
	uint64_t timestamp;
	/*
	 * Duration of the pass. dump_latencies() treats a zero value as the
	 * end of the valid entries in a buffer.
	 */
	uint64_t latency;
};

/*
 * Global state shared between main() and the worker threads.
 *
 * NOTE(review): 'done' and 'tid_offset' are plain variables written by
 * main() and read by the workers without any synchronisation; confirm
 * whether they should be atomic.
 */

/* Workers wait here once after initialising; created with nr_threads parties. */
static pthread_barrier_t init_barrier;
/* Held by dump_latencies() while a thread prints its samples. */
pthread_mutex_t report_lock;
/* One allocation holding nr_threads regions of nr_samples uint64_t each. */
uint64_t *shared_space;
/* CLOCK_REALTIME in ns, sampled by main() before the workers are created. */
uint64_t born;
/* Samples with a latency (ns) below this value are not kept; from argv[2]. */
uint64_t min_latency;
/* Thread handles, one per worker. */
pthread_t *threads;
/* Number of workers: one per argument after the first two. */
int nr_threads;
/* Number of uint64_t words in each thread's region (alignment / 8). */
int nr_samples;
/* Size in bytes requested for each thread's region; from argv[1]. */
int alignment;
/*
 * Rotation applied by workers when choosing which region to touch;
 * main() advances it once per second, modulo nr_threads.
 */
int tid_offset = 0;
/* Set to true by main() to make the workers leave their sampling loop. */
bool done;

/*
 * Flatten a struct timespec into a single count of nanoseconds.
 *
 * tv: time value to convert; must not be NULL.
 *
 * Returns tv_sec scaled to nanoseconds plus tv_nsec, as a uint64_t.
 */
static inline uint64_t timespec_to_ns(struct timespec *tv)
{
	const uint64_t ns_per_sec = 1000000000;
	uint64_t whole_seconds_ns = (uint64_t)tv->tv_sec * ns_per_sec;

	return whole_seconds_ns + tv->tv_nsec;
}

/*
 * Return the start of the region of shared_space that belongs to thread
 * index tid.
 *
 * tid: zero-based thread index, expected to be in [0, nr_threads).
 *
 * Regions are laid out back to back, nr_samples words each. The offset is
 * computed in size_t rather than int so that large regions combined with a
 * high thread index cannot overflow a signed int.
 */
static uint64_t *tid_to_thread_space(int tid)
{
	return shared_space + ((size_t)tid * (size_t)nr_samples);
}

/*
 * Map a pointer into shared_space back to the index of the region that
 * contains it.
 *
 * thread_space: pointer into the shared buffer.
 *
 * Returns the word offset from the start of shared_space divided by
 * nr_samples.
 */
static int thread_space_to_tid(uint64_t *thread_space)
{
	uint64_t *base = shared_space;

	return (thread_space - base) / nr_samples;
}

static void dump_latencies(int tid, struct access_timing *latencies)
{
	int i;

	pthread_mutex_lock(&report_lock);
	for (i = 0; i < REPORT_THRESHOLD; i++) {
		if (!latencies[i].latency)
			break;
		printf("%3d %16f %16f\n",
			tid,
			(double)latencies[i].timestamp / 1000000,
			(double)latencies[i].latency / 1000000);
	}
	pthread_mutex_unlock(&report_lock);

	memset(latencies, 0, REPORT_THRESHOLD * sizeof(struct access_timing));
}

/*
 * Store one sample in the caller's buffer if it is slow enough to keep.
 *
 * tid:       thread index, forwarded to dump_latencies() on a flush.
 * timestamp: value stored in the slot's timestamp field.
 * latency:   measured duration; only kept when >= min_latency.
 * latencies: buffer of REPORT_THRESHOLD entries.
 * index:     slot to write to.
 *
 * The timestamp is always written to the slot. The slot is only consumed
 * (latency stored, index advanced) when the latency meets the threshold.
 * When the buffer becomes full it is dumped and the index wraps to 0.
 *
 * Returns the index to use for the next sample.
 */
static unsigned int record_sample(int tid, uint64_t timestamp, uint64_t latency, struct access_timing *latencies, int index)
{
	struct access_timing *slot = &latencies[index];

	slot->timestamp = timestamp;

	/* Too fast to be interesting: leave the slot for the next sample. */
	if (latency < min_latency)
		return index;

	slot->latency = latency;
	index++;
	if (index != REPORT_THRESHOLD)
		return index;

	/* Buffer is full: flush it and start again from the first slot. */
	dump_latencies(tid, latencies);
	return 0;
}

/*
 * Thread body: repeatedly write to one region of the shared buffer and
 * record how long each full pass over that region took.
 *
 * data: pointer to this thread's own region inside shared_space; the
 *       thread index is derived from it.
 *
 * Returns a pointer to the region that was touched last, or NULL if the
 * latency buffer could not be allocated.
 *
 * NOTE(review): main() stores a CPU number in the first word of the region
 * before creating the thread, but it is zeroed here without being read, so
 * no CPU binding takes place -- confirm whether that is intended.
 */
static void *worker(void *data)
{
	uint64_t *thread_space = (uint64_t *) data;
	struct access_timing *latencies;
	int tid;
	unsigned int index = 0;
	uint64_t start = 0, end = 0;
	struct timespec tv_start, tv_end;
	FILE *fd;

	/* Initialise work space */
	tid = thread_space_to_tid(thread_space);
	latencies = calloc(REPORT_THRESHOLD, sizeof(*latencies));
	if (!latencies) {
		fprintf(stderr, "ERROR: Thread %d failed to allocate space for latencies\n", tid);
		/* Still join the barrier so the other threads are not left waiting. */
		pthread_barrier_wait(&init_barrier);
		return NULL;
	}

	memset(thread_space, 0, nr_samples * sizeof(uint64_t));
	pthread_barrier_wait(&init_barrier);

	while (!done) {
		int i;

		/* The region to touch rotates as main() advances tid_offset. */
		thread_space = tid_to_thread_space((tid + tid_offset) % nr_threads);
		clock_gettime(CLOCK_REALTIME, &tv_start);
		/* 'start' still holds the previous pass's value (0 on the first). */
		for (i = 0; i < nr_samples; i++)
			thread_space[i] += start;
		clock_gettime(CLOCK_REALTIME, &tv_end);
		start = timespec_to_ns(&tv_start);
		end = timespec_to_ns(&tv_end);
		index = record_sample(tid, end - born, end - start, latencies, index);
	}

	/* Flush whatever is left, then release the private buffer. */
	dump_latencies(tid, latencies);
	free(latencies);

	/*
	 * Make it impossible to optimise thread_space writes. This is
	 * best-effort: if the scratch file cannot be opened the write is
	 * skipped rather than passing a NULL stream to fwrite().
	 */
	fd = fopen("/tmp/thotdata", "w");
	if (fd) {
		fwrite(thread_space,  sizeof(uint64_t), 1, fd);
		fclose(fd);
	}
	unlink("/tmp/thotdata");

	return &thread_space[0];
}

/*
 * Entry point.
 *
 * Arguments: alignment min_latency cpu [cpu ...]
 *   alignment    size in bytes of each thread's region of the shared buffer
 *   min_latency  samples faster than this many nanoseconds are not kept
 *   cpu ...      one worker thread is created per remaining argument
 *
 * Allocates the shared buffer, starts the workers, rotates tid_offset once
 * a second for 120 seconds, then stops and joins the workers.
 */
int main(int argc, char **argv)
{
	int i, err;
	size_t shared_size;
	struct timespec tv_start;

	if (argc <= 3) {
		printf("Usage: numab-hot-test alignment min_latency cpulist\n");
		exit(EXIT_FAILURE);
	}

	alignment = atoi(argv[1]);
	min_latency = atoi(argv[2]);
	nr_threads = argc - 3;

	/*
	 * A region smaller than one word would give nr_samples == 0, which is
	 * later used as a divisor when mapping a region back to a thread index.
	 */
	if (alignment < (int)sizeof(uint64_t)) {
		fprintf(stderr, "FATAL: alignment must be at least %zu bytes\n",
			sizeof(uint64_t));
		exit(EXIT_FAILURE);
	}
	nr_samples = alignment / sizeof(uint64_t);

	/* Widen before multiplying so large regions cannot overflow int. */
	shared_size = (size_t)nr_samples * (size_t)nr_threads * sizeof(uint64_t);

	/* posix_memalign() returns the error number; it does not set errno. */
	err = posix_memalign((void **)&shared_space, HPAGE_SIZE, shared_size);
	if (err) {
		fprintf(stderr, "posix_memalign: %s\n", strerror(err));
		exit(EXIT_FAILURE);
	}

	threads = malloc(nr_threads * sizeof(*threads));
	if (!threads) {
		printf("FATAL: Insufficient memory to allocate thread structures\n");
		exit(EXIT_FAILURE);
	}

	done = false;
	err = pthread_barrier_init(&init_barrier, NULL, nr_threads);
	if (err) {
		fprintf(stderr, "ERROR: Failed to init barrier: %s\n", strerror(err));
		exit(EXIT_FAILURE);
	}
	if (pthread_mutex_init(&report_lock, NULL) != 0) {
		fprintf(stderr, "ERROR: Failed to init mutex\n");
		exit(EXIT_FAILURE);
	}

	/* Reference point for the timestamps reported by the workers. */
	clock_gettime(CLOCK_REALTIME, &tv_start);
	born = timespec_to_ns(&tv_start);

	for (i = 3; i < argc; i++) {
		uint64_t *thread_space = tid_to_thread_space(i - 3);
		int cpu = atoi(argv[i]);

		thread_space[0] = cpu;
		/* pthread_create() also reports failure via its return value. */
		err = pthread_create(&threads[i - 3], NULL, worker, thread_space);
		if (err) {
			fprintf(stderr, "Creating thread: %s\n", strerror(err));
			exit(EXIT_FAILURE);
		}
	}

	/* Run for 120 seconds, shifting every worker to the next region each second. */
	for (i = 0; i < 120; i++) {
		sleep(1);
		tid_offset = (tid_offset + 1) % nr_threads;
	}
	done = true;

	for (i = 0; i < nr_threads; i++)
		pthread_join(threads[i], NULL);
	pthread_barrier_destroy(&init_barrier);
	pthread_mutex_destroy(&report_lock);
	free(threads);
	free(shared_space);

	exit(EXIT_SUCCESS);
}
==== END thotdata.c ====
