feat: mcs locks and profiling

main
Sean McBride 4 years ago
parent c291b049c8
commit 6673734857

@@ -1,8 +1,11 @@
#pragma once
#include <spinlock/fas.h>
#include <spinlock/mcs.h>
#include <stdint.h>
#include "runtime.h"
#include "worker_thread.h"
/* Should be Power of 2! */
#define PERF_WINDOW_BUFFER_SIZE 16
@@ -13,7 +16,7 @@
struct perf_window {
uint64_t buffer[PERF_WINDOW_BUFFER_SIZE];
uint64_t count;
ck_spinlock_fas_t lock;
ck_spinlock_mcs_t queue;
double mean;
};
@@ -26,7 +29,7 @@ static inline void
perf_window_update_mean(struct perf_window *self)
{
assert(self != NULL);
assert(ck_spinlock_fas_locked(&self->lock));
assert(ck_spinlock_mcs_locked(&self->queue));
uint64_t limit = self->count;
if (limit > PERF_WINDOW_BUFFER_SIZE) { limit = PERF_WINDOW_BUFFER_SIZE; }
@@ -47,7 +50,7 @@ perf_window_initialize(struct perf_window *self)
{
assert(self != NULL);
ck_spinlock_fas_init(&self->lock);
ck_spinlock_mcs_init(&self->queue);
self->count = 0;
self->mean = 0;
memset(&self->buffer, 0, sizeof(uint64_t) * PERF_WINDOW_BUFFER_SIZE);
@@ -64,14 +67,18 @@ perf_window_add(struct perf_window *self, uint64_t value)
{
assert(self != NULL);
/* A successful invocation should run for a non-zero amount of time */
assert(value > 0);
ck_spinlock_fas_lock(&self->lock);
struct ck_spinlock_mcs lock;
uint64_t pre = __getcycles();
ck_spinlock_mcs_lock(&self->queue, &lock);
worker_thread_lock_duration += (__getcycles() - pre);
self->buffer[self->count++ % PERF_WINDOW_BUFFER_SIZE] = value;
perf_window_update_mean(self);
ck_spinlock_fas_unlock(&self->lock);
ck_spinlock_mcs_unlock(&self->queue, &lock);
}
/**
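
The locking change is the same everywhere in this commit: the fetch-and-store (fas) spinlock is replaced by an MCS lock, whose waiters queue up and each spin on their own stack-allocated node (FIFO ordering, no shared cache line to bounce under contention), and each acquisition is timed so the wait can be charged to a thread-local contention counter. A minimal standalone sketch of that pattern, assuming an x86-64 cycle counter; the example_ names are illustrative stand-ins for the runtime's __getcycles() and worker_thread_lock_duration, not code from this commit:

#include <stdint.h>
#include <x86intrin.h>
#include <spinlock/mcs.h> /* vendored Concurrency Kit MCS spinlock, as included by this repo */

/* Stand-in for the runtime's cycle counter (assumed to wrap rdtsc) */
static inline uint64_t
example_getcycles(void)
{
	return __rdtsc();
}

/* Stand-ins for the thread-local contention counter and a shared lock */
static __thread uint64_t example_lock_duration = 0;
static ck_spinlock_mcs_t example_queue         = NULL; /* NULL == unlocked, same effect as ck_spinlock_mcs_init */

static void
example_timed_critical_section(void)
{
	/* Each acquisition supplies its own queue node; waiters spin locally on it */
	struct ck_spinlock_mcs node;

	uint64_t pre = example_getcycles();
	ck_spinlock_mcs_lock(&example_queue, &node);
	example_lock_duration += example_getcycles() - pre;

	/* ... critical section ... */

	ck_spinlock_mcs_unlock(&example_queue, &node);
}

perf_window_add above and priority_queue_length/enqueue/delete below follow exactly this shape; priority_queue_dequeue and priority_queue_top use the trylock variant sketched after the priority queue changes.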

@@ -1,7 +1,10 @@
#ifndef PRIORITY_QUEUE_H
#define PRIORITY_QUEUE_H
#include <spinlock/fas.h>
#include <spinlock/mcs.h>
#include "runtime.h"
#include "worker_thread.h"
#define MAX 4096
@@ -17,7 +20,7 @@ typedef uint64_t (*priority_queue_get_priority_fn_t)(void *element);
/* We assume that priority is expressed in terms of a 64 bit unsigned integral */
struct priority_queue {
ck_spinlock_fas_t lock;
ck_spinlock_mcs_t queue;
uint64_t highest_priority;
void * items[MAX];
int first_free;

@@ -8,6 +8,8 @@
If there are fewer cores than this, main dynamically overrides this and uses all available */
#define WORKER_THREAD_CORE_COUNT (NCORES > 1 ? NCORES - 1 : NCORES)
extern __thread uint64_t worker_thread_lock_duration;
extern __thread uint64_t worker_thread_start_timestamp;
extern __thread uv_loop_t worker_thread_uvio_handle;
void *worker_thread_main(void *return_code);

@@ -22,7 +22,7 @@ static inline int
priority_queue_append(struct priority_queue *self, void *new_item)
{
assert(self != NULL);
assert(ck_spinlock_fas_locked(&self->lock));
assert(ck_spinlock_mcs_locked(&self->queue));
if (self->first_free >= MAX) return -ENOSPC;
@@ -39,7 +39,7 @@ priority_queue_percolate_up(struct priority_queue *self)
{
assert(self != NULL);
assert(self->get_priority_fn != NULL);
assert(ck_spinlock_fas_locked(&self->lock));
assert(ck_spinlock_mcs_locked(&self->queue));
for (int i = self->first_free - 1;
i / 2 != 0 && self->get_priority_fn(self->items[i]) < self->get_priority_fn(self->items[i / 2]); i /= 2) {
@@ -64,7 +64,7 @@ priority_queue_find_smallest_child(struct priority_queue *self, int parent_index
assert(self != NULL);
assert(parent_index >= 1 && parent_index < self->first_free);
assert(self->get_priority_fn != NULL);
assert(ck_spinlock_fas_locked(&self->lock));
assert(ck_spinlock_mcs_locked(&self->queue));
int left_child_index = 2 * parent_index;
int right_child_index = 2 * parent_index + 1;
@@ -92,7 +92,7 @@ priority_queue_percolate_down(struct priority_queue *self, int parent_index)
{
assert(self != NULL);
assert(self->get_priority_fn != NULL);
assert(ck_spinlock_fas_locked(&self->lock));
assert(ck_spinlock_mcs_locked(&self->queue));
int left_child_index = 2 * parent_index;
while (left_child_index >= 2 && left_child_index < self->first_free) {
@@ -120,7 +120,7 @@ static inline bool
priority_queue_is_empty_locked(struct priority_queue *self)
{
assert(self != NULL);
assert(ck_spinlock_fas_locked(&self->lock));
assert(ck_spinlock_mcs_locked(&self->queue));
return self->first_free == 1;
}
@@ -141,7 +141,7 @@ priority_queue_initialize(struct priority_queue *self, priority_queue_get_priori
memset(self->items, 0, sizeof(void *) * MAX);
ck_spinlock_fas_init(&self->lock);
ck_spinlock_mcs_init(&self->queue);
self->first_free = 1;
self->get_priority_fn = get_priority_fn;
@@ -157,9 +157,15 @@ int
priority_queue_length(struct priority_queue *self)
{
assert(self != NULL);
ck_spinlock_fas_lock(&self->lock);
struct ck_spinlock_mcs lock;
uint64_t pre = __getcycles();
ck_spinlock_mcs_lock(&self->queue, &lock);
worker_thread_lock_duration += (__getcycles() - pre);
int length = self->first_free - 1;
ck_spinlock_fas_unlock(&self->lock);
ck_spinlock_mcs_unlock(&self->queue, &lock);
return length;
}
@@ -172,7 +178,11 @@ int
priority_queue_enqueue(struct priority_queue *self, void *value)
{
assert(self != NULL);
ck_spinlock_fas_lock(&self->lock);
struct ck_spinlock_mcs lock;
uint64_t pre = __getcycles();
ck_spinlock_mcs_lock(&self->queue, &lock);
worker_thread_lock_duration += (__getcycles() - pre);
if (priority_queue_append(self, value) == -ENOSPC) return -ENOSPC;
@@ -182,7 +192,9 @@ priority_queue_enqueue(struct priority_queue *self, void *value)
} else {
priority_queue_percolate_up(self);
}
ck_spinlock_fas_unlock(&self->lock);
ck_spinlock_mcs_unlock(&self->queue, &lock);
return 0;
}
/**
@@ -194,7 +206,11 @@ int
priority_queue_delete(struct priority_queue *self, void *value)
{
assert(self != NULL);
ck_spinlock_fas_lock(&self->lock);
struct ck_spinlock_mcs lock;
uint64_t pre = __getcycles();
ck_spinlock_mcs_lock(&self->queue, &lock);
worker_thread_lock_duration += (__getcycles() - pre);
bool did_delete = false;
for (int i = 1; i < self->first_free; i++) {
@@ -206,7 +222,8 @@ priority_queue_delete(struct priority_queue *self, void *value)
}
}
ck_spinlock_fas_unlock(&self->lock);
ck_spinlock_mcs_unlock(&self->queue, &lock);
if (!did_delete) return -1;
return 0;
}
@@ -225,10 +242,14 @@ priority_queue_dequeue(struct priority_queue *self, void **dequeued_element)
int return_code;
if (ck_spinlock_fas_trylock(&self->lock) == false) {
struct ck_spinlock_mcs lock;
uint64_t pre = __getcycles();
if (ck_spinlock_mcs_trylock(&self->queue, &lock) == false) {
worker_thread_lock_duration += (__getcycles() - pre);
return_code = -EAGAIN;
goto done;
};
worker_thread_lock_duration += (__getcycles() - pre);
if (priority_queue_is_empty_locked(self)) {
return_code = -ENOENT;
@@ -250,7 +271,7 @@ priority_queue_dequeue(struct priority_queue *self, void **dequeued_element)
return_code = 0;
release_lock:
ck_spinlock_fas_unlock(&self->lock);
ck_spinlock_mcs_unlock(&self->queue, &lock);
done:
return return_code;
}
@@ -270,10 +291,14 @@ priority_queue_top(struct priority_queue *self, void **dequeued_element)
int return_code;
if (ck_spinlock_fas_trylock(&self->lock) == false) {
struct ck_spinlock_mcs lock;
uint64_t pre = __getcycles();
if (ck_spinlock_mcs_trylock(&self->queue, &lock) == false) {
worker_thread_lock_duration += (__getcycles() - pre);
return_code = -EAGAIN;
goto done;
};
worker_thread_lock_duration += (__getcycles() - pre);
if (priority_queue_is_empty_locked(self)) {
return_code = -ENOENT;
@@ -284,7 +309,7 @@ priority_queue_top(struct priority_queue *self, void **dequeued_element)
return_code = 0;
release_lock:
ck_spinlock_fas_unlock(&self->lock);
ck_spinlock_mcs_unlock(&self->queue, &lock);
done:
return return_code;
}
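
priority_queue_dequeue and priority_queue_top above take the non-blocking path instead: ck_spinlock_mcs_trylock either acquires the lock or the caller backs off with -EAGAIN, and the cycles spent in the attempt are charged to the counter in both cases. A sketch of that variant, reusing the illustrative example_ helpers from the earlier sketch:

#include <errno.h>
#include <stdbool.h>

static int
example_try_critical_section(void)
{
	struct ck_spinlock_mcs node;

	uint64_t pre = example_getcycles();
	if (ck_spinlock_mcs_trylock(&example_queue, &node) == false) {
		/* The failed attempt still counts toward lock overhead */
		example_lock_duration += example_getcycles() - pre;
		return -EAGAIN;
	}
	example_lock_duration += example_getcycles() - pre;

	/* ... critical section ... */

	ck_spinlock_mcs_unlock(&example_queue, &node);
	return 0;
}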

@@ -29,10 +29,29 @@ __thread uv_loop_t worker_thread_uvio_handle;
/* Flag to signify if the thread is currently running callbacks in the libuv event loop */
static __thread bool worker_thread_is_in_libuv_event_loop = false;
/* Total Lock Contention in Cycles */
__thread uint64_t worker_thread_lock_duration;
/* Timestamp when worker thread began executing */
__thread uint64_t worker_thread_start_timestamp;
/***********************
* Worker Thread Logic *
**********************/
/**
* Reports lock contention for the worker thread
*/
static inline void
worker_thread_dump_lock_overhead()
{
#ifdef DEBUG
uint64_t worker_duration = __getcycles() - worker_thread_start_timestamp;
debuglog("Locks consumed %lu / %lu cycles, or %f%%\n", worker_thread_lock_duration, worker_duration,
(double)worker_thread_lock_duration / worker_duration * 100);
#endif
}
/**
* Conditionally triggers appropriate state changes for exiting sandboxes
* @param exiting_sandbox - The sandbox that ran to completion
@@ -228,6 +247,10 @@ worker_thread_execute_libuv_event_loop(void)
void *
worker_thread_main(void *return_code)
{
/* Initialize Bookkeeping */
worker_thread_start_timestamp = __getcycles();
worker_thread_lock_duration = 0;
/* Initialize Base Context */
arch_context_init(&worker_thread_base_context, 0, 0);
@@ -286,6 +309,7 @@ worker_thread_on_sandbox_exit(struct sandbox *exiting_sandbox)
{
assert(exiting_sandbox);
software_interrupt_disable();
worker_thread_dump_lock_overhead();
worker_thread_switch_to_base_context();
assert(0);
}
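
With DEBUG defined, worker_thread_dump_lock_overhead() reports the accumulated ratio when a sandbox exits. As an illustration with made-up numbers: a worker that has been running for 1,000,000,000 cycles while worker_thread_lock_duration sits at 5,000,000 would log roughly "Locks consumed 5000000 / 1000000000 cycles, or 0.500000%" (modulo whatever prefix debuglog adds).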
