From 667373485755ba23d37e5161da120f9f1912d138 Mon Sep 17 00:00:00 2001
From: Sean McBride
Date: Sun, 2 Aug 2020 11:37:22 -0400
Subject: [PATCH] feat: mcs locks and profiling

---
 runtime/include/perf_window.h    | 21 ++++++++----
 runtime/include/priority_queue.h |  7 ++--
 runtime/include/worker_thread.h  |  2 ++
 runtime/src/priority_queue.c     | 57 +++++++++++++++++++++++---------
 runtime/src/worker_thread.c     | 24 ++++++++++++++
 5 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/runtime/include/perf_window.h b/runtime/include/perf_window.h
index c2f293c..15614a4 100644
--- a/runtime/include/perf_window.h
+++ b/runtime/include/perf_window.h
@@ -1,8 +1,11 @@
 #pragma once

-#include <spinlock/fas.h>
+#include <spinlock/mcs.h>
 #include <stdint.h>

+#include "runtime.h"
+#include "worker_thread.h"
+
 /* Should be Power of 2! */
 #define PERF_WINDOW_BUFFER_SIZE 16

@@ -13,7 +16,7 @@
 struct perf_window {
        uint64_t          buffer[PERF_WINDOW_BUFFER_SIZE];
        uint64_t          count;
-       ck_spinlock_fas_t lock;
+       ck_spinlock_mcs_t queue;
        double            mean;
 };
@@ -26,7 +29,7 @@ static inline void
 perf_window_update_mean(struct perf_window *self)
 {
        assert(self != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        uint64_t limit = self->count;
        if (limit > PERF_WINDOW_BUFFER_SIZE) { limit = PERF_WINDOW_BUFFER_SIZE; }
@@ -47,7 +50,7 @@ perf_window_initialize(struct perf_window *self)
 {
        assert(self != NULL);

-       ck_spinlock_fas_init(&self->lock);
+       ck_spinlock_mcs_init(&self->queue);
        self->count = 0;
        self->mean  = 0;
        memset(&self->buffer, 0, sizeof(uint64_t) * PERF_WINDOW_BUFFER_SIZE);
@@ -64,14 +67,18 @@ perf_window_add(struct perf_window *self, uint64_t value)
 {
        assert(self != NULL);

-       /* A successful invocation should run for a non-zero amount of time */
        assert(value > 0);

-       ck_spinlock_fas_lock(&self->lock);
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);
+
        self->buffer[self->count++ % PERF_WINDOW_BUFFER_SIZE] = value;
        perf_window_update_mean(self);
-       ck_spinlock_fas_unlock(&self->lock);
+
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
 }

 /**
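Note: the change above swaps a fetch-and-store (FAS) spinlock, where every waiter spins on one shared word, for an MCS queue lock, where each waiter enqueues a node it owns and spins locally until its predecessor hands the lock off; acquisition is FIFO, so no thread starves. That is why the struct field becomes "queue" (it is the tail pointer of the waiter queue) and why every call site now also passes a stack-allocated struct ck_spinlock_mcs node. A minimal standalone sketch of the Concurrency Kit MCS calls the patch relies on (illustrative only, not part of the diff; the include paths on the -/+ lines above are likewise reconstructed from CK's spinlock/ header layout):

#include <ck_spinlock.h>

static ck_spinlock_mcs_t queue;   /* shared tail of the waiter queue */
static int               counter; /* state guarded by the lock */

static void
increment(void)
{
        struct ck_spinlock_mcs node; /* per-acquisition queue node; stack storage is
                                      * safe because we unlock before returning */

        ck_spinlock_mcs_lock(&queue, &node);   /* enqueue node, spin until granted */
        counter++;                             /* critical section */
        ck_spinlock_mcs_unlock(&queue, &node); /* hand off to the next queued waiter */
}

int
main(void)
{
        ck_spinlock_mcs_init(&queue); /* same initializer the patch uses */
        increment();
        return counter == 1 ? 0 : 1;
}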
diff --git a/runtime/include/priority_queue.h b/runtime/include/priority_queue.h
index 534196a..a22f023 100644
--- a/runtime/include/priority_queue.h
+++ b/runtime/include/priority_queue.h
@@ -1,7 +1,10 @@
 #ifndef PRIORITY_QUEUE_H
 #define PRIORITY_QUEUE_H

-#include <spinlock/fas.h>
+#include <spinlock/mcs.h>
+
+#include "runtime.h"
+#include "worker_thread.h"

 #define MAX 4096
@@ -17,7 +20,7 @@ typedef uint64_t (*priority_queue_get_priority_fn_t)(void *element);

 /* We assume that priority is expressed in terms of a 64 bit unsigned integral */
 struct priority_queue {
-       ck_spinlock_fas_t lock;
+       ck_spinlock_mcs_t queue;
        uint64_t          highest_priority;
        void *            items[MAX];
        int               first_free;
diff --git a/runtime/include/worker_thread.h b/runtime/include/worker_thread.h
index e1f989c..b0a8711 100644
--- a/runtime/include/worker_thread.h
+++ b/runtime/include/worker_thread.h
@@ -8,6 +8,8 @@
 If there are fewer cores than this, main dynamically overrides this and uses all available */
 #define WORKER_THREAD_CORE_COUNT (NCORES > 1 ? NCORES - 1 : NCORES)

+extern __thread uint64_t worker_thread_lock_duration;
+extern __thread uint64_t worker_thread_start_timestamp;
 extern __thread uv_loop_t worker_thread_uvio_handle;

 void *worker_thread_main(void *return_code);
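The two thread-local counters exported above let header-only code such as perf_window.h charge lock-wait cycles to whichever worker thread is running. The measured acquire sequence (read the cycle counter, take the MCS lock, add the elapsed cycles to worker_thread_lock_duration) appears verbatim in perf_window_add and at five more sites in priority_queue.c below. A sketch of a helper that could factor it out; the name lock_and_account is hypothetical and not part of this patch, while __getcycles() and the counter come from the runtime.h/worker_thread.h includes the patch adds:

#include <stdint.h>

#include <ck_spinlock.h>

#include "runtime.h"       /* __getcycles() */
#include "worker_thread.h" /* worker_thread_lock_duration */

/* Hypothetical helper: acquire an MCS lock and charge the wait
 * to this worker's thread-local contention counter */
static inline void
lock_and_account(ck_spinlock_mcs_t *queue, struct ck_spinlock_mcs *node)
{
        uint64_t pre = __getcycles();
        ck_spinlock_mcs_lock(queue, node);
        worker_thread_lock_duration += (__getcycles() - pre);
}

Each call site would then shrink to lock_and_account(&self->queue, &lock). The unlock path is left unmeasured by the patch, since releasing an MCS lock is a bounded handoff rather than a wait.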
diff --git a/runtime/src/priority_queue.c b/runtime/src/priority_queue.c
index 6565a43..c5e07f1 100644
--- a/runtime/src/priority_queue.c
+++ b/runtime/src/priority_queue.c
@@ -22,7 +22,7 @@ static inline int
 priority_queue_append(struct priority_queue *self, void *new_item)
 {
        assert(self != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        if (self->first_free >= MAX) return -ENOSPC;
@@ -39,7 +39,7 @@ priority_queue_percolate_up(struct priority_queue *self)
 {
        assert(self != NULL);
        assert(self->get_priority_fn != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        for (int i = self->first_free - 1;
             i / 2 != 0 && self->get_priority_fn(self->items[i]) < self->get_priority_fn(self->items[i / 2]); i /= 2) {
@@ -64,7 +64,7 @@ priority_queue_find_smallest_child(struct priority_queue *self, int parent_index)
        assert(self != NULL);
        assert(parent_index >= 1 && parent_index < self->first_free);
        assert(self->get_priority_fn != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        int left_child_index  = 2 * parent_index;
        int right_child_index = 2 * parent_index + 1;
@@ -92,7 +92,7 @@ priority_queue_percolate_down(struct priority_queue *self, int parent_index)
 {
        assert(self != NULL);
        assert(self->get_priority_fn != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        int left_child_index = 2 * parent_index;
        while (left_child_index >= 2 && left_child_index < self->first_free) {
@@ -120,7 +120,7 @@ static inline bool
 priority_queue_is_empty_locked(struct priority_queue *self)
 {
        assert(self != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        return self->first_free == 1;
 }
@@ -141,7 +141,7 @@ priority_queue_initialize(struct priority_queue *self, priority_queue_get_priori

        memset(self->items, 0, sizeof(void *) * MAX);

-       ck_spinlock_fas_init(&self->lock);
+       ck_spinlock_mcs_init(&self->queue);
        self->first_free      = 1;
        self->get_priority_fn = get_priority_fn;
@@ -157,9 +157,15 @@ int
 priority_queue_length(struct priority_queue *self)
 {
        assert(self != NULL);
-       ck_spinlock_fas_lock(&self->lock);
+
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);
+
        int length = self->first_free - 1;
-       ck_spinlock_fas_unlock(&self->lock);
+
+       ck_spinlock_mcs_unlock(&self->queue, &lock);

        return length;
 }
@@ -172,7 +178,11 @@ int
 priority_queue_enqueue(struct priority_queue *self, void *value)
 {
        assert(self != NULL);
-       ck_spinlock_fas_lock(&self->lock);
+
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);

        if (priority_queue_append(self, value) == -ENOSPC) return -ENOSPC;
@@ -182,7 +192,9 @@ priority_queue_enqueue(struct priority_queue *self, void *value)
        } else {
                priority_queue_percolate_up(self);
        }
-       ck_spinlock_fas_unlock(&self->lock);
+
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
+
        return 0;
 }
 /**
@@ -194,7 +206,11 @@ int
 priority_queue_delete(struct priority_queue *self, void *value)
 {
        assert(self != NULL);
-       ck_spinlock_fas_lock(&self->lock);
+
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);

        bool did_delete = false;
        for (int i = 1; i < self->first_free; i++) {
@@ -206,7 +222,8 @@ priority_queue_delete(struct priority_queue *self, void *value)
                }
        }

-       ck_spinlock_fas_unlock(&self->lock);
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
+
        if (!did_delete) return -1;
        return 0;
 }
@@ -225,10 +242,14 @@ priority_queue_dequeue(struct priority_queue *self, void **dequeued_element)

        int return_code;

-       if (ck_spinlock_fas_trylock(&self->lock) == false) {
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       if (ck_spinlock_mcs_trylock(&self->queue, &lock) == false) {
+               worker_thread_lock_duration += (__getcycles() - pre);
                return_code = -EAGAIN;
                goto done;
        };
+       worker_thread_lock_duration += (__getcycles() - pre);

        if (priority_queue_is_empty_locked(self)) {
                return_code = -ENOENT;
@@ -250,7 +271,7 @@ priority_queue_dequeue(struct priority_queue *self, void **dequeued_element)
        return_code = 0;

 release_lock:
-       ck_spinlock_fas_unlock(&self->lock);
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
 done:
        return return_code;
 }
@@ -270,10 +291,14 @@ priority_queue_top(struct priority_queue *self, void **dequeued_element)

        int return_code;

-       if (ck_spinlock_fas_trylock(&self->lock) == false) {
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       if (ck_spinlock_mcs_trylock(&self->queue, &lock) == false) {
+               worker_thread_lock_duration += (__getcycles() - pre);
                return_code = -EAGAIN;
                goto done;
        };
+       worker_thread_lock_duration += (__getcycles() - pre);

        if (priority_queue_is_empty_locked(self)) {
                return_code = -ENOENT;
@@ -284,7 +309,7 @@ priority_queue_top(struct priority_queue *self, void **dequeued_element)
        return_code = 0;

 release_lock:
-       ck_spinlock_fas_unlock(&self->lock);
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
 done:
        return return_code;
 }
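Note on the two trylock sites above: priority_queue_dequeue and priority_queue_top never block. A failed ck_spinlock_mcs_trylock still charges the attempt's cycles to worker_thread_lock_duration and surfaces as -EAGAIN, which callers can distinguish from -ENOENT for an empty queue. A short sketch of how a caller might tell the cases apart (the function and its runqueue parameter are invented for illustration; the return codes are the patch's):

#include <errno.h>
#include <stddef.h>

#include "priority_queue.h"

void *
try_next_item(struct priority_queue *runqueue)
{
        void *item = NULL;

        switch (priority_queue_dequeue(runqueue, &item)) {
        case 0:
                return item; /* removed the highest-priority element */
        case -EAGAIN:
                return NULL; /* lock contended: fail fast, retry later */
        case -ENOENT:
        default:
                return NULL; /* queue empty */
        }
}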
diff --git a/runtime/src/worker_thread.c b/runtime/src/worker_thread.c
index f05d3b2..ab61445 100644
--- a/runtime/src/worker_thread.c
+++ b/runtime/src/worker_thread.c
@@ -29,10 +29,29 @@ __thread uv_loop_t worker_thread_uvio_handle;
 /* Flag to signify if the thread is currently running callbacks in the libuv event loop */
 static __thread bool worker_thread_is_in_libuv_event_loop = false;

+/* Total Lock Contention in Cycles */
+__thread uint64_t worker_thread_lock_duration;
+
+/* Timestamp when worker thread began executing */
+__thread uint64_t worker_thread_start_timestamp;
+
 /***********************
  * Worker Thread Logic *
  **********************/

+/**
+ * Reports lock contention for the worker thread
+ */
+static inline void
+worker_thread_dump_lock_overhead()
+{
+#ifdef DEBUG
+       uint64_t worker_duration = __getcycles() - worker_thread_start_timestamp;
+       debuglog("Locks consumed %lu / %lu cycles, or %f%%\n", worker_thread_lock_duration, worker_duration,
+                (double)worker_thread_lock_duration / worker_duration * 100);
+#endif
+}
+
 /**
  * Conditionally triggers appropriate state changes for exiting sandboxes
  * @param exiting_sandbox - The sandbox that ran to completion
@@ -228,6 +247,10 @@ worker_thread_execute_libuv_event_loop(void)
 void *
 worker_thread_main(void *return_code)
 {
+       /* Initialize Bookkeeping */
+       worker_thread_start_timestamp = __getcycles();
+       worker_thread_lock_duration   = 0;
+
        /* Initialize Base Context */
        arch_context_init(&worker_thread_base_context, 0, 0);
@@ -286,6 +309,7 @@ worker_thread_on_sandbox_exit(struct sandbox *exiting_sandbox)
 {
        assert(exiting_sandbox);
        software_interrupt_disable();
+       worker_thread_dump_lock_overhead();
        worker_thread_switch_to_base_context();
        assert(0);
 }
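Taken together, the bookkeeping works like this: worker_thread_main stamps worker_thread_start_timestamp and zeroes worker_thread_lock_duration when the thread boots, every instrumented acquisition adds its wait, and worker_thread_dump_lock_overhead prints the ratio when a sandbox exits (DEBUG builds only). A self-contained sketch of that arithmetic, with a hand-advanced counter standing in for __getcycles():

#include <stdint.h>
#include <stdio.h>

__thread uint64_t worker_thread_lock_duration;
__thread uint64_t worker_thread_start_timestamp;

static uint64_t fake_tsc; /* stand-in for the real cycle counter */

static uint64_t
cycles(void)
{
        return fake_tsc;
}

int
main(void)
{
        /* worker_thread_main: initialize bookkeeping */
        worker_thread_start_timestamp = cycles();
        worker_thread_lock_duration   = 0;

        fake_tsc += 90; /* 90 cycles of useful work */

        /* one instrumented acquisition that waits 10 cycles */
        uint64_t pre = cycles();
        fake_tsc += 10;
        worker_thread_lock_duration += (cycles() - pre);

        /* worker_thread_dump_lock_overhead: report the ratio */
        uint64_t worker_duration = cycles() - worker_thread_start_timestamp;
        printf("Locks consumed %lu / %lu cycles, or %f%%\n", (unsigned long)worker_thread_lock_duration,
               (unsigned long)worker_duration, (double)worker_thread_lock_duration / worker_duration * 100);
        /* prints: Locks consumed 10 / 100 cycles, or 10.000000% */
        return 0;
}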