#pragma once
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include "current_sandbox.h"
#include "global_request_scheduler.h"
#include "global_request_scheduler_deque.h"
#include "global_request_scheduler_minheap.h"
#include "global_request_scheduler_mtds.h"
#include "local_runqueue.h"
#include "local_runqueue_minheap.h"
#include "local_runqueue_list.h"
#include "local_cleanup_queue.h"
#include "local_runqueue_mtds.h"
#include "panic.h"
#include "sandbox_functions.h"
#include "sandbox_types.h"
#include "sandbox_set_as_preempted.h"
#include "sandbox_set_as_runnable.h"
#include "sandbox_set_as_running_sys.h"
#include "sandbox_set_as_interrupted.h"
#include "sandbox_set_as_running_user.h"
#include "scheduler_execute_epoll_loop.h"
#include "scheduler_options.h"

/**
 * This scheduler provides for cooperative and preemptive multitasking in an OS process's userspace.
 *
 * When executing cooperatively, the scheduler is directly invoked via `scheduler_cooperative_sched`. It runs a single
 * time in the existing context in order to try to execute a direct sandbox-to-sandbox switch. When no sandboxes are
 * available to execute, the scheduler executes a context switch to `worker_thread_base_context`, which calls
 * `scheduler_cooperative_sched` in an infinite idle loop. If the scheduler needs to restore a sandbox that was
 * previously preempted, it raises a SIGUSR1 signal to enter the scheduler handler to be able to restore the full
 * mcontext structure saved during the last preemption. Otherwise, the cooperative scheduler triggers a "fast switch",
 * which only updates the instruction and stack pointers.
 *
 * Preemptive scheduling is provided by POSIX timers using a set interval that defines the scheduling quantum. Our
 * signal handler is configured to mask nested signals. Given that POSIX specifies that the kernel only delivers a
 * SIGALRM to a single thread, the lucky thread that receives the kernel signal has the responsibility of propagating
 * it onto all other worker threads. This must occur even when a worker thread is running a sandbox in a
 * nonpreemptable state.
 *
 * When a SIGALRM fires, a worker can be in one of four states:
 *
 * 1) "Running a signal handler" - We mask signals while we are executing a signal handler, which results in signals
 * being ignored. A kernel signal should get delivered to another unmasked worker, so propagation still occurs.
 *
 * 2) "Running the Cooperative Scheduler" - This is signified by the thread-local current_sandbox being set to NULL. We
 * propagate the signal and return immediately because we know we're already in the scheduler. We have no sandboxes to
 * interrupt, so no sandbox state transitions occur.
 *
 * 3) "Running a Sandbox in a state other than SANDBOX_RUNNING_USER" - We call sandbox_interrupt on current_sandbox,
 * propagate the sigalrms to the other workers, defer the sigalrm locally, and then return. The SANDBOX_INTERRUPTED
 * timekeeping data is increased to account for the time needed to propagate the sigalrms.
 *
 * 4) "Running a Sandbox in the SANDBOX_RUNNING_USER state" - We call sandbox_interrupt on current_sandbox, propagate
 * the sigalrms to the other workers, and then actually enter the scheduler via scheduler_preemptive_sched. The
 * interrupted sandbox may either be preempted or returned to, depending on the scheduler. If preempted, the
 * interrupted mcontext is saved to the sandbox structure. The SANDBOX_INTERRUPTED timekeeping data is increased to
 * account for the time needed to propagate the sigalrms, run epoll, query the scheduler data structure, and
 * (potentially) allocate and initialize a sandbox.
 */
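
/* A minimal, non-compiled sketch of how a worker's SIGALRM handler might dispatch across the four states
 * described above. The helpers propagate_sigalrm_to_other_workers() and defer_sigalrm_locally() are
 * assumptions made for illustration; the runtime's actual propagation, masking, and deferral logic lives
 * in its software interrupt handling code.
 *
 *   void
 *   sigalrm_handler_sketch(int signal_type, siginfo_t *signal_info, ucontext_t *interrupted_context)
 *   {
 *           // State 1 is handled implicitly: signals are masked while a handler runs, so this
 *           // handler is never entered reentrantly on the same worker.
 *
 *           // Only the worker that received the kernel-generated SIGALRM re-raises it on its peers
 *           if (signal_info->si_code == SI_KERNEL) propagate_sigalrm_to_other_workers();
 *
 *           struct sandbox *current = current_sandbox_get();
 *
 *           // State 2: already in the cooperative scheduler, so there is nothing to interrupt
 *           if (current == NULL) return;
 *
 *           enum sandbox_state interrupted_state = current->state;
 *           sandbox_interrupt(current);
 *
 *           if (interrupted_state == SANDBOX_RUNNING_USER) {
 *                   // State 4: enter the scheduler, possibly preempting the interrupted sandbox
 *                   scheduler_preemptive_sched(interrupted_context);
 *           } else {
 *                   // State 3: not preemptable right now, so defer the quantum and resume
 *                   defer_sigalrm_locally();
 *                   sandbox_interrupt_return(current, interrupted_state);
 *           }
 *   }
 */
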
static inline struct sandbox *
scheduler_mtdbf_get_next()
{
        return NULL;
}

static inline struct sandbox *
scheduler_mtds_get_next()
{
        /* Get the deadline of the sandbox at the head of the local queue */
        struct sandbox *local = local_runqueue_get_next();
        uint64_t local_deadline = local == NULL ? UINT64_MAX : local->absolute_deadline;
        enum MULTI_TENANCY_CLASS local_mt_class = MT_DEFAULT;
        struct sandbox *global = NULL;

        if (local) local_mt_class = local->tenant->pwt_sandboxes[worker_thread_idx].mt_class;

        uint64_t global_guaranteed_deadline = global_request_scheduler_mtds_guaranteed_peek();
        uint64_t global_default_deadline = global_request_scheduler_mtds_default_peek();

        /* Try to pull and allocate from the global queue if earlier
         * This will be placed at the head of the local runqueue */
        switch (local_mt_class) {
        case MT_GUARANTEED:
                if (global_guaranteed_deadline >= local_deadline) goto done;
                break;
        case MT_DEFAULT:
                if (global_guaranteed_deadline == UINT64_MAX && global_default_deadline >= local_deadline) goto done;
                break;
        }

        if (global_request_scheduler_mtds_remove_with_mt_class(&global, local_deadline, local_mt_class) == 0) {
                assert(global != NULL);
                sandbox_prepare_execution_environment(global);
                assert(global->state == SANDBOX_INITIALIZED);
                sandbox_set_as_runnable(global, SANDBOX_INITIALIZED);
        }

/* Return what is at the head of the local runqueue or NULL if empty */
done:
        return local_runqueue_get_next();
}

static inline struct sandbox *
scheduler_edf_get_next()
{
        /* Get the deadline of the sandbox at the head of the local queue */
        struct sandbox *local = local_runqueue_get_next();
        uint64_t local_deadline = local == NULL ? UINT64_MAX : local->absolute_deadline;
        struct sandbox *global = NULL;

        uint64_t global_deadline = global_request_scheduler_peek();

        /* Try to pull and allocate from the global queue if earlier
         * This will be placed at the head of the local runqueue */
        if (global_deadline < local_deadline) {
                if (global_request_scheduler_remove_if_earlier(&global, local_deadline) == 0) {
                        assert(global != NULL);
                        assert(global->absolute_deadline < local_deadline);
                        sandbox_prepare_execution_environment(global);
                        assert(global->state == SANDBOX_INITIALIZED);
                        sandbox_set_as_runnable(global, SANDBOX_INITIALIZED);
                }
        }

        /* Return what is at the head of the local runqueue or NULL if empty */
        return local_runqueue_get_next();
}

static inline struct sandbox *
scheduler_fifo_get_next()
{
        struct sandbox *local = local_runqueue_get_next();
        struct sandbox *global = NULL;

        if (local == NULL) {
                /* If the local runqueue is empty, pull from global request scheduler */
                if (global_request_scheduler_remove(&global) < 0) goto done;

                sandbox_prepare_execution_environment(global);
                sandbox_set_as_runnable(global, SANDBOX_INITIALIZED);
        } else if (local == current_sandbox_get()) {
                /* Execute Round Robin Scheduling Logic if the head is the current sandbox */
                local_runqueue_list_rotate();
        }

done:
        return local_runqueue_get_next();
}

static inline struct sandbox *
scheduler_get_next()
{
        switch (scheduler) {
        case SCHEDULER_MTDBF:
                return scheduler_mtdbf_get_next();
        case SCHEDULER_MTDS:
                return scheduler_mtds_get_next();
        case SCHEDULER_EDF:
                return scheduler_edf_get_next();
        case SCHEDULER_FIFO:
                return scheduler_fifo_get_next();
        default:
                panic("Unimplemented\n");
        }
}

static inline void
scheduler_initialize()
{
        switch (scheduler) {
        case SCHEDULER_MTDBF:
                // global_request_scheduler_mtdbf_initialize();
                break;
        case SCHEDULER_MTDS:
                global_request_scheduler_mtds_initialize();
                break;
        case SCHEDULER_EDF:
                global_request_scheduler_minheap_initialize();
                break;
        case SCHEDULER_FIFO:
                global_request_scheduler_deque_initialize();
                break;
        default:
                panic("Invalid scheduler policy: %u\n", scheduler);
        }
}

static inline void
scheduler_runqueue_initialize()
{
        switch (scheduler) {
        case SCHEDULER_MTDBF:
                // local_runqueue_mtdbf_initialize();
                break;
        case SCHEDULER_MTDS:
                local_runqueue_mtds_initialize();
                break;
        case SCHEDULER_EDF:
                local_runqueue_minheap_initialize();
                break;
        case SCHEDULER_FIFO:
                local_runqueue_list_initialize();
                break;
        default:
                panic("Invalid scheduler policy: %u\n", scheduler);
        }
}

static inline char *
scheduler_print(enum SCHEDULER variant)
{
        switch (variant) {
        case SCHEDULER_FIFO:
                return "FIFO";
        case SCHEDULER_EDF:
                return "EDF";
        case SCHEDULER_MTDS:
                return "MTDS";
        case SCHEDULER_MTDBF:
                return "MTDBF";
        }
}

static inline void
scheduler_log_sandbox_switch(struct sandbox *current_sandbox, struct sandbox *next_sandbox)
{
#ifdef LOG_CONTEXT_SWITCHES
        if (current_sandbox == NULL) {
                /* Switching from "Base Context" */
                debuglog("Base Context (@%p) (%s) > Sandbox %lu (@%p) (%s)\n", &worker_thread_base_context,
                         arch_context_variant_print(worker_thread_base_context.variant), next_sandbox->id,
                         &next_sandbox->ctxt, arch_context_variant_print(next_sandbox->ctxt.variant));
        } else if (next_sandbox == NULL) {
                debuglog("Sandbox %lu (@%p) (%s) > Base Context (@%p) (%s)\n", current_sandbox->id,
                         &current_sandbox->ctxt, arch_context_variant_print(current_sandbox->ctxt.variant),
                         &worker_thread_base_context, arch_context_variant_print(worker_thread_base_context.variant));
        } else {
                debuglog("Sandbox %lu (@%p) (%s) > Sandbox %lu (@%p) (%s)\n", current_sandbox->id,
                         &current_sandbox->ctxt, arch_context_variant_print(current_sandbox->ctxt.variant),
                         next_sandbox->id, &next_sandbox->ctxt, arch_context_variant_print(next_sandbox->ctxt.variant));
        }
#endif
}

static inline void
scheduler_preemptive_switch_to(ucontext_t *interrupted_context, struct sandbox *next)
{
        /* Switch to next sandbox */
        switch (next->ctxt.variant) {
        case ARCH_CONTEXT_VARIANT_FAST: {
                assert(next->state == SANDBOX_RUNNABLE);
                arch_context_restore_fast(&interrupted_context->uc_mcontext, &next->ctxt);
                current_sandbox_set(next);
                sandbox_set_as_running_sys(next, SANDBOX_RUNNABLE);
                break;
        }
        case ARCH_CONTEXT_VARIANT_SLOW: {
                assert(next->state == SANDBOX_PREEMPTED);
                arch_context_restore_slow(&interrupted_context->uc_mcontext, &next->ctxt);
                current_sandbox_set(next);
                sandbox_set_as_running_user(next, SANDBOX_PREEMPTED);
                break;
        }
        default: {
                panic("Unexpectedly tried to switch to a context in %s state\n",
                      arch_context_variant_print(next->ctxt.variant));
        }
        }
}

/**
 * Called at preemption or blocking points to update the scheduler-specific
 * properties for the given tenant.
 */
static inline void
scheduler_process_policy_specific_updates_on_interrupts(struct sandbox *interrupted_sandbox)
{
        switch (scheduler) {
        case SCHEDULER_FIFO:
                return;
        case SCHEDULER_EDF:
                return;
        case SCHEDULER_MTDS:
                local_timeout_queue_process_promotions();
                return;
        case SCHEDULER_MTDBF:
                // scheduler_check_messages_from_listener();
                if (interrupted_sandbox->state != SANDBOX_ERROR) {
                        sandbox_process_scheduler_updates(interrupted_sandbox);
                }
                return;
        }
}

/**
 * Called by the SIGALRM handler after a quantum expires
 * Assumes the caller validates that there is something to preempt
 * Selects the next sandbox and, if it differs from the interrupted one, switches to it directly
 * @param interrupted_context - The context of our user-level Worker thread
 */
static inline void
scheduler_preemptive_sched(ucontext_t *interrupted_context)
{
        assert(interrupted_context != NULL);

        /* Process epoll to make sure that all runnable jobs are considered for execution */
        scheduler_execute_epoll_loop();

        struct sandbox *interrupted_sandbox = current_sandbox_get();
        assert(interrupted_sandbox != NULL);
        assert(interrupted_sandbox->state == SANDBOX_INTERRUPTED);
        // printf ("Worker #%d interrupted sandbox #%lu\n", worker_thread_idx, interrupted_sandbox->id);

        scheduler_process_policy_specific_updates_on_interrupts(interrupted_sandbox);

        struct sandbox *next = scheduler_get_next();
        /* Assumption: the current sandbox is on the runqueue, so the scheduler should always return something */
        assert(next != NULL);

        /* If current equals next, no switch is necessary, so resume execution */
        if (interrupted_sandbox == next) {
                sandbox_interrupt_return(interrupted_sandbox, SANDBOX_RUNNING_USER);
                return;
        }

#ifdef LOG_PREEMPTION
        debuglog("Preempting sandbox %lu to run sandbox %lu\n", interrupted_sandbox->id, next->id);
#endif

        /* Preempt executing sandbox */
        scheduler_log_sandbox_switch(interrupted_sandbox, next);
        sandbox_preempt(interrupted_sandbox);

        // Write back global at idx 0
        wasm_globals_set_i64(&interrupted_sandbox->globals, 0, sledge_abi__current_wasm_module_instance.abi.wasmg_0,
                             true);

        arch_context_save_slow(&interrupted_sandbox->ctxt, &interrupted_context->uc_mcontext);
        scheduler_preemptive_switch_to(interrupted_context, next);
}

/**
 * @brief Switches to the next sandbox
 * Assumption: called from the cooperative scheduler, so current_sandbox is NULL
 * @param current_context The context to switch away from
 * @param next_sandbox The Sandbox to switch to
 */
static inline void
scheduler_cooperative_switch_to(struct arch_context *current_context, struct sandbox *next_sandbox)
{
        assert(current_sandbox_get() == NULL);

        struct arch_context *next_context = &next_sandbox->ctxt;

        /* Switch to next sandbox */
        switch (next_sandbox->state) {
        case SANDBOX_RUNNABLE: {
                assert(next_context->variant == ARCH_CONTEXT_VARIANT_FAST);
                current_sandbox_set(next_sandbox);
                sandbox_set_as_running_sys(next_sandbox, SANDBOX_RUNNABLE);
                break;
        }
        case SANDBOX_PREEMPTED: {
                assert(next_context->variant == ARCH_CONTEXT_VARIANT_SLOW);
                current_sandbox_set(next_sandbox);
                /* arch_context_switch triggers a SIGUSR1, which transitions next_sandbox to running_user */
                break;
        }
        default: {
                panic("Unexpectedly tried to switch to a sandbox in %s state\n",
                      sandbox_state_stringify(next_sandbox->state));
        }
        }

        arch_context_switch(current_context, next_context);
}

static inline void
scheduler_switch_to_base_context(struct arch_context *current_context)
{
        /* Assumption: Base Worker context should never be preempted */
        assert(worker_thread_base_context.variant == ARCH_CONTEXT_VARIANT_FAST);
        arch_context_switch(current_context, &worker_thread_base_context);
}

/* The idle loop is executed by the base context. It should not be called directly. */
static inline void
scheduler_idle_loop()
{
        while (true) {
                /* Assumption: only called by the "base context" */
                assert(current_sandbox_get() == NULL);

                /* Deferred signals should have been cleared by this point */
                assert(deferred_sigalrm == 0);

                /* Try to wake up sleeping sandboxes */
                scheduler_execute_epoll_loop();

                /* Switch to a sandbox if one is ready to run */
                struct sandbox *next_sandbox = scheduler_get_next();
                if (next_sandbox != NULL) {
                        scheduler_cooperative_switch_to(&worker_thread_base_context, next_sandbox);
                }

                /* Clear the cleanup queue */
                local_cleanup_queue_free();
        }
}

/**
 * @brief Used to cooperatively switch sandboxes when a sandbox sleeps or exits
 * Because of use-after-free bugs that interfere with our loggers, when a sandbox exits and switches away never to
 * return, the boolean add_to_cleanup_queue needs to be set to true. Otherwise, we will leak sandboxes.
 * @param add_to_cleanup_queue - Indicates that the sandbox should be added to the cleanup queue before switching
 * away
 */
static inline void
scheduler_cooperative_sched(bool add_to_cleanup_queue)
{
        struct sandbox *exiting_sandbox = current_sandbox_get();
        assert(exiting_sandbox != NULL);

        /* Clearing the current sandbox indicates that we are entering the cooperative scheduler */
        current_sandbox_set(NULL);
        barrier();
        software_interrupt_deferred_sigalrm_clear();

        struct arch_context *exiting_context = &exiting_sandbox->ctxt;

        /* Assumption: Called by an exiting or sleeping sandbox */
        assert(current_sandbox_get() == NULL);

        /* Deferred signals should have been cleared by this point */
        assert(deferred_sigalrm == 0);

        /* Try to wake up sleeping sandboxes */
        scheduler_execute_epoll_loop();

        /* We have not added ourselves to the cleanup queue, so we can free */
        local_cleanup_queue_free();

        /* Switch to a sandbox if one is ready to run */
        struct sandbox *next_sandbox = scheduler_get_next();

        /* If our sandbox slept and immediately woke up, we can just return */
        if (next_sandbox == exiting_sandbox) {
                sandbox_set_as_running_sys(next_sandbox, SANDBOX_RUNNABLE);
                current_sandbox_set(next_sandbox);
                return;
        }

        scheduler_log_sandbox_switch(exiting_sandbox, next_sandbox);

        // Write back global at idx 0
        wasm_globals_set_i64(&exiting_sandbox->globals, 0, sledge_abi__current_wasm_module_instance.abi.wasmg_0, true);

        if (add_to_cleanup_queue) local_cleanup_queue_add(exiting_sandbox);
        /* Do not touch the sandbox struct after this point! */

        if (next_sandbox != NULL) {
                scheduler_cooperative_switch_to(exiting_context, next_sandbox);
        } else {
                scheduler_switch_to_base_context(exiting_context);
        }
}
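
/* An illustrative (non-compiled) sketch of the two kinds of call sites the add_to_cleanup_queue flag
 * distinguishes. The wrapper names current_sandbox_sleep() and current_sandbox_exit() are assumptions
 * made for illustration; the runtime's actual wrappers may be named differently.
 *
 *   void
 *   current_sandbox_sleep(void)
 *   {
 *           // The sandbox expects to be woken up and resumed later, so it must NOT be freed
 *           scheduler_cooperative_sched(false);
 *   }
 *
 *   void
 *   current_sandbox_exit(void)
 *   {
 *           // The sandbox will never run again, so queue it for cleanup before switching away
 *           scheduler_cooperative_sched(true);
 *           // Unreachable: the scheduler never switches back to an exited sandbox
 *   }
 */
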
/**
 * Used under the EDF policy to determine whether the request at the head of the global queue
 * would preempt the sandbox currently assigned to a given worker
 * @param worker_idx - index of the worker thread to check
 * @returns true if the earliest global deadline is earlier than that worker's current deadline
 */
static inline bool
scheduler_worker_would_preempt(int worker_idx)
{
        assert(scheduler == SCHEDULER_EDF);
        uint64_t local_deadline = runtime_worker_threads_deadline[worker_idx];
        uint64_t global_deadline = global_request_scheduler_peek();
        return global_deadline < local_deadline;
}
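
/* A minimal usage sketch (illustrative only) of scheduler_worker_would_preempt, assuming a hypothetical
 * listener-side helper that decides whether to signal a worker when a new request arrives. The
 * runtime_worker_threads array, runtime_worker_threads_count, and the pthread_kill-based propagation
 * shown here are assumptions for illustration, not necessarily the runtime's actual mechanism.
 *
 *   static void
 *   listener_maybe_signal_workers_sketch(void)
 *   {
 *           for (int i = 0; i < runtime_worker_threads_count; i++) {
 *                   if (scheduler_worker_would_preempt(i)) {
 *                           pthread_kill(runtime_worker_threads[i], SIGALRM);
 *                           break;
 *                   }
 *           }
 *   }
 */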