From 667373485755ba23d37e5161da120f9f1912d138 Mon Sep 17 00:00:00 2001
From: Sean McBride
Date: Sun, 2 Aug 2020 11:37:22 -0400
Subject: [PATCH] feat: mcs locks and profiling

---
 runtime/include/perf_window.h    | 21 ++++++++----
 runtime/include/priority_queue.h |  7 ++--
 runtime/include/worker_thread.h  |  2 ++
 runtime/src/priority_queue.c     | 57 +++++++++++++++++++++++---------
 runtime/src/worker_thread.c     | 24 ++++++++++++++
 5 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/runtime/include/perf_window.h b/runtime/include/perf_window.h
index c2f293c..15614a4 100644
--- a/runtime/include/perf_window.h
+++ b/runtime/include/perf_window.h
@@ -1,8 +1,11 @@
 #pragma once

-#include <spinlock/fas.h>
+#include <spinlock/mcs.h>
 #include <stdint.h>

+#include "runtime.h"
+#include "worker_thread.h"
+
 /* Should be Power of 2! */
 #define PERF_WINDOW_BUFFER_SIZE 16

@@ -13,7 +16,7 @@
 struct perf_window {
        uint64_t          buffer[PERF_WINDOW_BUFFER_SIZE];
        uint64_t          count;
-       ck_spinlock_fas_t lock;
+       ck_spinlock_mcs_t queue;
        double            mean;
 };
@@ -26,7 +29,7 @@ static inline void
 perf_window_update_mean(struct perf_window *self)
 {
        assert(self != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        uint64_t limit = self->count;
        if (limit > PERF_WINDOW_BUFFER_SIZE) { limit = PERF_WINDOW_BUFFER_SIZE; }
@@ -47,7 +50,7 @@ perf_window_initialize(struct perf_window *self)
 {
        assert(self != NULL);

-       ck_spinlock_fas_init(&self->lock);
+       ck_spinlock_mcs_init(&self->queue);
        self->count = 0;
        self->mean  = 0;
        memset(&self->buffer, 0, sizeof(uint64_t) * PERF_WINDOW_BUFFER_SIZE);
@@ -64,14 +67,18 @@ perf_window_add(struct perf_window *self, uint64_t value)
 {
        assert(self != NULL);

-       /* A successful invocation should run for a non-zero amount of time */
        assert(value > 0);

-       ck_spinlock_fas_lock(&self->lock);
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);
+
        self->buffer[self->count++ % PERF_WINDOW_BUFFER_SIZE] = value;
        perf_window_update_mean(self);
-       ck_spinlock_fas_unlock(&self->lock);
+
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
 }

 /**
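Note: the change above swaps a fetch-and-store (FAS) spinlock, where every waiter spins on one shared word, for an MCS queue lock, where each waiter enqueues a node it owns and spins locally until its predecessor hands the lock off; acquisition is FIFO, so no thread starves. That is why the struct field becomes "queue" (it is the tail pointer of the waiter queue) and why every call site now also passes a stack-allocated struct ck_spinlock_mcs node. A minimal standalone sketch of the Concurrency Kit MCS calls the patch relies on (illustrative only, not part of the diff; the include paths on the -/+ lines above are likewise reconstructed from CK's spinlock/ header layout):

#include <ck_spinlock.h>

static ck_spinlock_mcs_t queue;   /* shared tail of the waiter queue */
static int               counter; /* state guarded by the lock */

static void
increment(void)
{
        struct ck_spinlock_mcs node; /* per-acquisition queue node; stack storage is
                                      * safe because we unlock before returning */

        ck_spinlock_mcs_lock(&queue, &node);   /* enqueue node, spin until granted */
        counter++;                             /* critical section */
        ck_spinlock_mcs_unlock(&queue, &node); /* hand off to the next queued waiter */
}

int
main(void)
{
        ck_spinlock_mcs_init(&queue); /* same initializer the patch uses */
        increment();
        return counter == 1 ? 0 : 1;
}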
diff --git a/runtime/include/priority_queue.h b/runtime/include/priority_queue.h
index 534196a..a22f023 100644
--- a/runtime/include/priority_queue.h
+++ b/runtime/include/priority_queue.h
@@ -1,7 +1,10 @@
 #ifndef PRIORITY_QUEUE_H
 #define PRIORITY_QUEUE_H

-#include <spinlock/fas.h>
+#include <spinlock/mcs.h>
+
+#include "runtime.h"
+#include "worker_thread.h"

 #define MAX 4096
@@ -17,7 +20,7 @@ typedef uint64_t (*priority_queue_get_priority_fn_t)(void *element);

 /* We assume that priority is expressed in terms of a 64 bit unsigned integral */
 struct priority_queue {
-       ck_spinlock_fas_t lock;
+       ck_spinlock_mcs_t queue;
        uint64_t          highest_priority;
        void *            items[MAX];
        int               first_free;
diff --git a/runtime/include/worker_thread.h b/runtime/include/worker_thread.h
index e1f989c..b0a8711 100644
--- a/runtime/include/worker_thread.h
+++ b/runtime/include/worker_thread.h
@@ -8,6 +8,8 @@
 If there are fewer cores than this, main dynamically overrides this and uses all available */
 #define WORKER_THREAD_CORE_COUNT (NCORES > 1 ? NCORES - 1 : NCORES)

+extern __thread uint64_t worker_thread_lock_duration;
+extern __thread uint64_t worker_thread_start_timestamp;
 extern __thread uv_loop_t worker_thread_uvio_handle;

 void *worker_thread_main(void *return_code);
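The two thread-local counters exported above let header-only code such as perf_window.h charge lock-wait cycles to whichever worker thread is running. The measured acquire sequence (read the cycle counter, take the MCS lock, add the elapsed cycles to worker_thread_lock_duration) appears verbatim in perf_window_add and at five more sites in priority_queue.c below. A sketch of a helper that could factor it out; the name lock_and_account is hypothetical and not part of this patch, while __getcycles() and the counter come from the runtime.h/worker_thread.h includes the patch adds:

#include <stdint.h>

#include <ck_spinlock.h>

#include "runtime.h"       /* __getcycles() */
#include "worker_thread.h" /* worker_thread_lock_duration */

/* Hypothetical helper: acquire an MCS lock and charge the wait
 * to this worker's thread-local contention counter */
static inline void
lock_and_account(ck_spinlock_mcs_t *queue, struct ck_spinlock_mcs *node)
{
        uint64_t pre = __getcycles();
        ck_spinlock_mcs_lock(queue, node);
        worker_thread_lock_duration += (__getcycles() - pre);
}

Each call site would then shrink to lock_and_account(&self->queue, &lock). The unlock path is left unmeasured by the patch, since releasing an MCS lock is a bounded handoff rather than a wait.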
diff --git a/runtime/src/priority_queue.c b/runtime/src/priority_queue.c
index 6565a43..c5e07f1 100644
--- a/runtime/src/priority_queue.c
+++ b/runtime/src/priority_queue.c
@@ -22,7 +22,7 @@ static inline int
 priority_queue_append(struct priority_queue *self, void *new_item)
 {
        assert(self != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        if (self->first_free >= MAX) return -ENOSPC;
@@ -39,7 +39,7 @@ priority_queue_percolate_up(struct priority_queue *self)
 {
        assert(self != NULL);
        assert(self->get_priority_fn != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        for (int i = self->first_free - 1;
             i / 2 != 0 && self->get_priority_fn(self->items[i]) < self->get_priority_fn(self->items[i / 2]); i /= 2) {
@@ -64,7 +64,7 @@ priority_queue_find_smallest_child(struct priority_queue *self, int parent_index)
        assert(self != NULL);
        assert(parent_index >= 1 && parent_index < self->first_free);
        assert(self->get_priority_fn != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        int left_child_index  = 2 * parent_index;
        int right_child_index = 2 * parent_index + 1;
@@ -92,7 +92,7 @@ priority_queue_percolate_down(struct priority_queue *self, int parent_index)
 {
        assert(self != NULL);
        assert(self->get_priority_fn != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        int left_child_index = 2 * parent_index;
        while (left_child_index >= 2 && left_child_index < self->first_free) {
@@ -120,7 +120,7 @@ static inline bool
 priority_queue_is_empty_locked(struct priority_queue *self)
 {
        assert(self != NULL);
-       assert(ck_spinlock_fas_locked(&self->lock));
+       assert(ck_spinlock_mcs_locked(&self->queue));

        return self->first_free == 1;
 }
@@ -141,7 +141,7 @@ priority_queue_initialize(struct priority_queue *self, priority_queue_get_priori

        memset(self->items, 0, sizeof(void *) * MAX);

-       ck_spinlock_fas_init(&self->lock);
+       ck_spinlock_mcs_init(&self->queue);
        self->first_free      = 1;
        self->get_priority_fn = get_priority_fn;
@@ -157,9 +157,15 @@ int
 priority_queue_length(struct priority_queue *self)
 {
        assert(self != NULL);
-       ck_spinlock_fas_lock(&self->lock);
+
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);
+
        int length = self->first_free - 1;
-       ck_spinlock_fas_unlock(&self->lock);
+
+       ck_spinlock_mcs_unlock(&self->queue, &lock);

        return length;
 }
@@ -172,7 +178,11 @@ int
 priority_queue_enqueue(struct priority_queue *self, void *value)
 {
        assert(self != NULL);
-       ck_spinlock_fas_lock(&self->lock);
+
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);

        if (priority_queue_append(self, value) == -ENOSPC) return -ENOSPC;
@@ -182,7 +192,9 @@ priority_queue_enqueue(struct priority_queue *self, void *value)
        } else {
                priority_queue_percolate_up(self);
        }
-       ck_spinlock_fas_unlock(&self->lock);
+
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
+
        return 0;
 }
 /**
@@ -194,7 +206,11 @@ int
 priority_queue_delete(struct priority_queue *self, void *value)
 {
        assert(self != NULL);
-       ck_spinlock_fas_lock(&self->lock);
+
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       ck_spinlock_mcs_lock(&self->queue, &lock);
+       worker_thread_lock_duration += (__getcycles() - pre);

        bool did_delete = false;
        for (int i = 1; i < self->first_free; i++) {
@@ -206,7 +222,8 @@ priority_queue_delete(struct priority_queue *self, void *value)
                }
        }

-       ck_spinlock_fas_unlock(&self->lock);
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
+
        if (!did_delete) return -1;
        return 0;
 }
@@ -225,10 +242,14 @@ priority_queue_dequeue(struct priority_queue *self, void **dequeued_element)

        int return_code;

-       if (ck_spinlock_fas_trylock(&self->lock) == false) {
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       if (ck_spinlock_mcs_trylock(&self->queue, &lock) == false) {
+               worker_thread_lock_duration += (__getcycles() - pre);
                return_code = -EAGAIN;
                goto done;
        };
+       worker_thread_lock_duration += (__getcycles() - pre);

        if (priority_queue_is_empty_locked(self)) {
                return_code = -ENOENT;
@@ -250,7 +271,7 @@ priority_queue_dequeue(struct priority_queue *self, void **dequeued_element)
        return_code = 0;

 release_lock:
-       ck_spinlock_fas_unlock(&self->lock);
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
 done:
        return return_code;
 }
@@ -270,10 +291,14 @@ priority_queue_top(struct priority_queue *self, void **dequeued_element)

        int return_code;

-       if (ck_spinlock_fas_trylock(&self->lock) == false) {
+       struct ck_spinlock_mcs lock;
+       uint64_t               pre = __getcycles();
+       if (ck_spinlock_mcs_trylock(&self->queue, &lock) == false) {
+               worker_thread_lock_duration += (__getcycles() - pre);
                return_code = -EAGAIN;
                goto done;
        };
+       worker_thread_lock_duration += (__getcycles() - pre);

        if (priority_queue_is_empty_locked(self)) {
                return_code = -ENOENT;
@@ -284,7 +309,7 @@ priority_queue_top(struct priority_queue *self, void **dequeued_element)
        return_code = 0;

 release_lock:
-       ck_spinlock_fas_unlock(&self->lock);
+       ck_spinlock_mcs_unlock(&self->queue, &lock);
 done:
        return return_code;
 }
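Note on the two trylock sites above: priority_queue_dequeue and priority_queue_top never block. A failed ck_spinlock_mcs_trylock still charges the attempt's cycles to worker_thread_lock_duration and surfaces as -EAGAIN, which callers can distinguish from -ENOENT for an empty queue. A short sketch of how a caller might tell the cases apart (the function and its runqueue parameter are invented for illustration; the return codes are the patch's):

#include <errno.h>
#include <stddef.h>

#include "priority_queue.h"

void *
try_next_item(struct priority_queue *runqueue)
{
        void *item = NULL;

        switch (priority_queue_dequeue(runqueue, &item)) {
        case 0:
                return item; /* removed the highest-priority element */
        case -EAGAIN:
                return NULL; /* lock contended: fail fast, retry later */
        case -ENOENT:
        default:
                return NULL; /* queue empty */
        }
}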
diff --git a/runtime/src/worker_thread.c b/runtime/src/worker_thread.c
index f05d3b2..ab61445 100644
--- a/runtime/src/worker_thread.c
+++ b/runtime/src/worker_thread.c
@@ -29,10 +29,29 @@ __thread uv_loop_t worker_thread_uvio_handle;
 /* Flag to signify if the thread is currently running callbacks in the libuv event loop */
 static __thread bool worker_thread_is_in_libuv_event_loop = false;

+/* Total Lock Contention in Cycles */
+__thread uint64_t worker_thread_lock_duration;
+
+/* Timestamp when worker thread began executing */
+__thread uint64_t worker_thread_start_timestamp;
+
 /***********************
  * Worker Thread Logic *
  **********************/

+/**
+ * Reports lock contention for the worker thread
+ */
+static inline void
+worker_thread_dump_lock_overhead()
+{
+#ifdef DEBUG
+       uint64_t worker_duration = __getcycles() - worker_thread_start_timestamp;
+       debuglog("Locks consumed %lu / %lu cycles, or %f%%\n", worker_thread_lock_duration, worker_duration,
+                (double)worker_thread_lock_duration / worker_duration * 100);
+#endif
+}
+
 /**
  * Conditionally triggers appropriate state changes for exiting sandboxes
  * @param exiting_sandbox - The sandbox that ran to completion
@@ -228,6 +247,10 @@ worker_thread_execute_libuv_event_loop(void)
 void *
 worker_thread_main(void *return_code)
 {
+       /* Initialize Bookkeeping */
+       worker_thread_start_timestamp = __getcycles();
+       worker_thread_lock_duration   = 0;
+
        /* Initialize Base Context */
        arch_context_init(&worker_thread_base_context, 0, 0);
@@ -286,6 +309,7 @@ worker_thread_on_sandbox_exit(struct sandbox *exiting_sandbox)
 {
        assert(exiting_sandbox);
        software_interrupt_disable();
+       worker_thread_dump_lock_overhead();
        worker_thread_switch_to_base_context();
        assert(0);
 }
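Taken together, the bookkeeping works like this: worker_thread_main stamps worker_thread_start_timestamp and zeroes worker_thread_lock_duration when the thread boots, every instrumented acquisition adds its wait, and worker_thread_dump_lock_overhead prints the ratio when a sandbox exits (DEBUG builds only). A self-contained sketch of that arithmetic, with a hand-advanced counter standing in for __getcycles():

#include <stdint.h>
#include <stdio.h>

__thread uint64_t worker_thread_lock_duration;
__thread uint64_t worker_thread_start_timestamp;

static uint64_t fake_tsc; /* stand-in for the real cycle counter */

static uint64_t
cycles(void)
{
        return fake_tsc;
}

int
main(void)
{
        /* worker_thread_main: initialize bookkeeping */
        worker_thread_start_timestamp = cycles();
        worker_thread_lock_duration   = 0;

        fake_tsc += 90; /* 90 cycles of useful work */

        /* one instrumented acquisition that waits 10 cycles */
        uint64_t pre = cycles();
        fake_tsc += 10;
        worker_thread_lock_duration += (cycles() - pre);

        /* worker_thread_dump_lock_overhead: report the ratio */
        uint64_t worker_duration = cycles() - worker_thread_start_timestamp;
        printf("Locks consumed %lu / %lu cycles, or %f%%\n", (unsigned long)worker_thread_lock_duration,
               (unsigned long)worker_duration, (double)worker_thread_lock_duration / worker_duration * 100);
        /* prints: Locks consumed 10 / 100 cycles, or 10.000000% */
        return 0;
}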