#pragma once #include #include #include #include "current_sandbox.h" #include "global_request_scheduler.h" #include "global_request_scheduler_deque.h" #include "global_request_scheduler_minheap.h" #include "global_request_scheduler_mtds.h" #include "local_cleanup_queue.h" #include "local_runqueue.h" #include "local_runqueue_list.h" #include "local_runqueue_minheap.h" #include "local_runqueue_mtds.h" #include "panic.h" #include "sandbox_functions.h" #include "sandbox_set_as_interrupted.h" #include "sandbox_set_as_preempted.h" #include "sandbox_set_as_runnable.h" #include "sandbox_set_as_running_sys.h" #include "sandbox_set_as_running_user.h" #include "sandbox_types.h" #include "scheduler_options.h" /** * This scheduler provides for cooperative and preemptive multitasking in a OS process's userspace. * * When executing cooperatively, the scheduler is directly invoked via `scheduler_cooperative_sched`. It runs a single * time in the existing context in order to try to execute a direct sandbox-to-sandbox switch. When no sandboxes are * available to execute, the scheduler executes a context switch to `worker_thread_base_context`, which calls * `scheduler_cooperative_sched` in an infinite idle loop. If the scheduler needs to restore a sandbox that was * previously preempted, it raises a SIGUSR1 signal to enter the scheduler handler to be able to restore the full * mcontext structure saved during the last preemption. Otherwise, the cooperative scheduler triggers a "fast switch", * which only updates the instruction and stack pointer. * * Preemptive scheduler is provided by POSIX timers using a set interval defining a scheduling quantum. Our signal * handler is configured to mask nested signals. Given that POSIX specifies that the kernel only delivers a SIGALRM to a * single thread, the lucky thread that receives the kernel thread has the responsibility of propagating this signal * onto all other worker threads. This must occur even when a worker thread is running a sandbox in a nonpreemptable * state. * * When a SIGALRM fires, a worker can be in one of four states: * * 1) "Running a signal handler" - We mask signals when we are executing a signal handler, which results in signals * being ignored. A kernel signal should get delivered to another unmasked worker, so propagation still occurs. * * 2) "Running the Cooperative Scheduler" - This is signified by the thread local current_sandbox being set to NULL. We * propagate the signal and return immediately because we know we're already in the scheduler. We have no sandboxes to * interrupt, so no sandbox state transitions occur. * * 3) "Running a Sandbox in a state other than SANDBOX_RUNNING_USER" - We call sandbox_interrupt on current_sandbox, * propagate the sigalrms to the other workers, defer the sigalrm locally, and then return. The SANDBOX_INTERRUPTED * timekeeping data is increased to account for the time needed to propagate the sigalrms. * * 4) "Running a Sandbox in the SANDBOX_RUNNING_USER state - We call sandbox_interrupt on current_sandbox, propagate * the sigalrms to the other workers, and then actually enter the scheduler via scheduler_preemptive_sched. The * interrupted sandbox may either be preempted or return to depending on the scheduler. If preempted, the interrupted * mcontext is saved to the sandbox structure. The SANDBOX_INTERRUPTED timekeeping data is increased to account for the * time needed to propagate the sigalrms, run epoll, query the scheduler data structure, and (potentially) allocate and * initialize a sandbox. */ static inline struct sandbox * scheduler_mtdbf_get_next() { return NULL; } static inline struct sandbox * scheduler_mtds_get_next() { /* Get the deadline of the sandbox at the head of the local queue */ struct sandbox *local = local_runqueue_get_next(); uint64_t local_deadline = local == NULL ? UINT64_MAX : local->absolute_deadline; enum MULTI_TENANCY_CLASS local_mt_class = MT_DEFAULT; struct sandbox *global = NULL; if (local) local_mt_class = local->tenant->pwt_sandboxes[worker_thread_idx].mt_class; uint64_t global_guaranteed_deadline = global_request_scheduler_mtds_guaranteed_peek(); uint64_t global_default_deadline = global_request_scheduler_mtds_default_peek(); /* Try to pull and allocate from the global queue if earlier * This will be placed at the head of the local runqueue */ switch (local_mt_class) { case MT_GUARANTEED: if (global_guaranteed_deadline >= local_deadline) goto done; break; case MT_DEFAULT: if (global_guaranteed_deadline == UINT64_MAX && global_default_deadline >= local_deadline) goto done; break; } if (global_request_scheduler_mtds_remove_with_mt_class(&global, local_deadline, local_mt_class) == 0) { assert(global != NULL); sandbox_prepare_execution_environment(global); assert(global->state == SANDBOX_INITIALIZED); sandbox_set_as_runnable(global, SANDBOX_INITIALIZED); } /* Return what is at the head of the local runqueue or NULL if empty */ done: return local_runqueue_get_next(); } static inline struct sandbox * scheduler_sjf_get_next() { struct sandbox *local = local_runqueue_get_next(); uint64_t local_rem_exec = local == NULL ? UINT64_MAX : local->remaining_exec; struct sandbox *global = NULL; uint64_t global_remaining_exec = global_request_scheduler_peek(); /* Try to pull and allocate from the global queue if earlier * This will be placed at the head of the local runqueue */ if (global_remaining_exec < local_rem_exec) { if (global_request_scheduler_remove_if_earlier(&global, local_rem_exec) == 0) { assert(global != NULL); assert(global->remaining_exec < local_rem_exec); sandbox_prepare_execution_environment(global); assert(global->state == SANDBOX_INITIALIZED); sandbox_set_as_runnable(global, SANDBOX_INITIALIZED); } } /* Return what is at the head of the local runqueue or NULL if empty */ return local_runqueue_get_next(); } static inline struct sandbox * scheduler_edf_get_next() { /* Get the deadline of the sandbox at the head of the local queue */ struct sandbox *local = local_runqueue_get_next(); uint64_t local_deadline = local == NULL ? UINT64_MAX : local->absolute_deadline; struct sandbox *global = NULL; uint64_t global_deadline = global_request_scheduler_peek(); /* Try to pull and allocate from the global queue if earlier * This will be placed at the head of the local runqueue */ if (global_deadline < local_deadline) { if (global_request_scheduler_remove_if_earlier(&global, local_deadline) == 0) { assert(global != NULL); assert(global->absolute_deadline < local_deadline); sandbox_prepare_execution_environment(global); assert(global->state == SANDBOX_INITIALIZED); sandbox_set_as_runnable(global, SANDBOX_INITIALIZED); } } /* Return what is at the head of the local runqueue or NULL if empty */ return local_runqueue_get_next(); } static inline struct sandbox * scheduler_fifo_get_next() { struct sandbox *local = local_runqueue_get_next(); struct sandbox *global = NULL; if (local == NULL) { /* If the local runqueue is empty, pull from global request scheduler */ if (global_request_scheduler_remove(&global) < 0) goto done; sandbox_prepare_execution_environment(global); sandbox_set_as_runnable(global, SANDBOX_INITIALIZED); } else if (local == current_sandbox_get()) { /* Execute Round Robin Scheduling Logic if the head is the current sandbox */ local_runqueue_list_rotate(); } done: return local_runqueue_get_next(); } static inline struct sandbox * scheduler_get_next() { switch (scheduler) { case SCHEDULER_MTDBF: return scheduler_mtdbf_get_next(); case SCHEDULER_MTDS: return scheduler_mtds_get_next(); case SCHEDULER_SJF: return scheduler_sjf_get_next(); case SCHEDULER_EDF: return scheduler_edf_get_next(); case SCHEDULER_FIFO: return scheduler_fifo_get_next(); default: panic("Unimplemented\n"); } } static inline void scheduler_initialize() { switch (scheduler) { case SCHEDULER_MTDBF: /* TODO: loading */ break; case SCHEDULER_MTDS: global_request_scheduler_mtds_initialize(); break; case SCHEDULER_EDF: case SCHEDULER_SJF: global_request_scheduler_minheap_initialize(); break; case SCHEDULER_FIFO: global_request_scheduler_deque_initialize(); break; default: panic("Invalid scheduler policy: %u\n", scheduler); } } static inline void scheduler_runqueue_initialize() { switch (scheduler) { case SCHEDULER_MTDBF: // local_runqueue_mtdbf_initialize(); break; case SCHEDULER_MTDS: local_runqueue_mtds_initialize(); break; case SCHEDULER_EDF: case SCHEDULER_SJF: local_runqueue_minheap_initialize(); break; case SCHEDULER_FIFO: local_runqueue_list_initialize(); break; default: panic("Invalid scheduler policy: %u\n", scheduler); } } static inline char * scheduler_print(enum SCHEDULER variant) { switch (variant) { case SCHEDULER_FIFO: return "FIFO"; case SCHEDULER_EDF: return "EDF"; case SCHEDULER_SJF: return "SJF"; case SCHEDULER_MTDS: return "MTDS"; case SCHEDULER_MTDBF: return "MTDBF"; } } static inline void scheduler_log_sandbox_switch(struct sandbox *current_sandbox, struct sandbox *next_sandbox) { #ifdef LOG_CONTEXT_SWITCHES if (current_sandbox == NULL) { /* Switching from "Base Context" */ debuglog("Base Context (@%p) (%s) > Sandbox %lu (@%p) (%s)\n", &worker_thread_base_context, arch_context_variant_print(worker_thread_base_context.variant), next_sandbox->id, &next_sandbox->ctxt, arch_context_variant_print(next_sandbox->ctxt.variant)); } else if (next_sandbox == NULL) { debuglog("Sandbox %lu (@%p) (%s) > Base Context (@%p) (%s)\n", current_sandbox->id, ¤t_sandbox->ctxt, arch_context_variant_print(current_sandbox->ctxt.variant), &worker_thread_base_context, arch_context_variant_print(worker_thread_base_context.variant)); } else { debuglog("Sandbox %lu (@%p) (%s) > Sandbox %lu (@%p) (%s)\n", current_sandbox->id, ¤t_sandbox->ctxt, arch_context_variant_print(current_sandbox->ctxt.variant), next_sandbox->id, &next_sandbox->ctxt, arch_context_variant_print(next_sandbox->ctxt.variant)); } #endif } static inline void scheduler_preemptive_switch_to(ucontext_t *interrupted_context, struct sandbox *next) { /* Switch to next sandbox */ switch (next->ctxt.variant) { case ARCH_CONTEXT_VARIANT_FAST: { assert(next->state == SANDBOX_RUNNABLE); arch_context_restore_fast(&interrupted_context->uc_mcontext, &next->ctxt); current_sandbox_set(next); sandbox_set_as_running_sys(next, SANDBOX_RUNNABLE); break; } case ARCH_CONTEXT_VARIANT_SLOW: { assert(next->state == SANDBOX_PREEMPTED); arch_context_restore_slow(&interrupted_context->uc_mcontext, &next->ctxt); current_sandbox_set(next); sandbox_set_as_running_user(next, SANDBOX_PREEMPTED); break; } default: { panic("Unexpectedly tried to switch to a context in %s state\n", arch_context_variant_print(next->ctxt.variant)); } } } /** * Call either at preemptions or blockings to update the scheduler-specific * properties for the given tenant. */ static inline void scheduler_process_policy_specific_updates_on_interrupts(struct sandbox *interrupted_sandbox) { switch (scheduler) { case SCHEDULER_FIFO: return; case SCHEDULER_EDF: case SCHEDULER_SJF: return; case SCHEDULER_MTDS: local_timeout_queue_process_promotions(); return; case SCHEDULER_MTDBF: // scheduler_check_messages_from_listener(); if (interrupted_sandbox->state != SANDBOX_ERROR) { sandbox_process_scheduler_updates(interrupted_sandbox); } return; } } /** * Called by the SIGALRM handler after a quantum * Assumes the caller validates that there is something to preempt * @param interrupted_context - The context of our user-level Worker thread * @returns the sandbox that the scheduler chose to run */ static inline void scheduler_preemptive_sched(ucontext_t *interrupted_context) { assert(interrupted_context != NULL); /* Process epoll to make sure that all runnable jobs are considered for execution */ struct sandbox *interrupted_sandbox = current_sandbox_get(); assert(interrupted_sandbox != NULL); assert(interrupted_sandbox->state == SANDBOX_INTERRUPTED); // printf ("Worker #%d interrupted sandbox #%lu\n", worker_thread_idx, interrupted_sandbox->id); scheduler_process_policy_specific_updates_on_interrupts(interrupted_sandbox); struct sandbox *next = scheduler_get_next(); /* Assumption: the current sandbox is on the runqueue, so the scheduler should always return something */ assert(next != NULL); /* If current equals next, no switch is necessary, so resume execution */ if (interrupted_sandbox == next) { sandbox_interrupt_return(interrupted_sandbox, SANDBOX_RUNNING_USER); return; } #ifdef LOG_PREEMPTION debuglog("Preempting sandbox %lu to run sandbox %lu\n", interrupted_sandbox->id, next->id); #endif /* Preempt executing sandbox */ scheduler_log_sandbox_switch(interrupted_sandbox, next); sandbox_preempt(interrupted_sandbox); // Write back global at idx 0 wasm_globals_set_i64(&interrupted_sandbox->globals, 0, sledge_abi__current_wasm_module_instance.abi.wasmg_0, true); arch_context_save_slow(&interrupted_sandbox->ctxt, &interrupted_context->uc_mcontext); scheduler_preemptive_switch_to(interrupted_context, next); } /** * @brief Switches to the next sandbox * Assumption: only called by the "base context" * @param next_sandbox The Sandbox to switch to */ static inline void scheduler_cooperative_switch_to(struct arch_context *current_context, struct sandbox *next_sandbox) { assert(current_sandbox_get() == NULL); struct arch_context *next_context = &next_sandbox->ctxt; /* Switch to next sandbox */ switch (next_sandbox->state) { case SANDBOX_RUNNABLE: { assert(next_context->variant == ARCH_CONTEXT_VARIANT_FAST); current_sandbox_set(next_sandbox); sandbox_set_as_running_sys(next_sandbox, SANDBOX_RUNNABLE); break; } case SANDBOX_PREEMPTED: { assert(next_context->variant == ARCH_CONTEXT_VARIANT_SLOW); current_sandbox_set(next_sandbox); /* arch_context_switch triggers a SIGUSR1, which transitions next_sandbox to running_user */ break; } default: { panic("Unexpectedly tried to switch to a sandbox in %s state\n", sandbox_state_stringify(next_sandbox->state)); } } arch_context_switch(current_context, next_context); } static inline void scheduler_switch_to_base_context(struct arch_context *current_context) { /* Assumption: Base Worker context should never be preempted */ assert(worker_thread_base_context.variant == ARCH_CONTEXT_VARIANT_FAST); arch_context_switch(current_context, &worker_thread_base_context); } /* The idle_loop is executed by the base_context. This should not be called directly */ static inline void scheduler_idle_loop() { while (true) { /* Assumption: only called by the "base context" */ assert(current_sandbox_get() == NULL); /* Deferred signals should have been cleared by this point */ assert(deferred_sigalrm == 0); /* Switch to a sandbox if one is ready to run */ struct sandbox *next_sandbox = scheduler_get_next(); if (next_sandbox != NULL) { scheduler_cooperative_switch_to(&worker_thread_base_context, next_sandbox); } /* Clear the cleanup queue */ local_cleanup_queue_free(); /* Improve the performance of spin-wait loops (works only if preemptions enabled) */ if (runtime_worker_spinloop_pause_enabled) pause(); } } /** * @brief Used to cooperative switch sandboxes when a sandbox sleeps or exits * Because of use-after-free bugs that interfere with our loggers, when a sandbox exits and switches away never to * return, the boolean add_to_cleanup_queue needs to be set to true. Otherwise, we will leak sandboxes. * @param add_to_cleanup_queue - Indicates that the sandbox should be added to the cleanup queue before switching * away */ static inline void scheduler_cooperative_sched(bool add_to_cleanup_queue) { struct sandbox *exiting_sandbox = current_sandbox_get(); assert(exiting_sandbox != NULL); /* Clearing current sandbox indicates we are entering the cooperative scheduler */ current_sandbox_set(NULL); barrier(); software_interrupt_deferred_sigalrm_clear(); struct arch_context *exiting_context = &exiting_sandbox->ctxt; /* Assumption: Called by an exiting or sleeping sandbox */ assert(current_sandbox_get() == NULL); /* Deferred signals should have been cleared by this point */ assert(deferred_sigalrm == 0); /* We have not added ourself to the cleanup queue, so we can free */ local_cleanup_queue_free(); /* Switch to a sandbox if one is ready to run */ struct sandbox *next_sandbox = scheduler_get_next(); /* If our sandbox slept and immediately woke up, we can just return */ if (next_sandbox == exiting_sandbox) { sandbox_set_as_running_sys(next_sandbox, SANDBOX_RUNNABLE); current_sandbox_set(next_sandbox); return; } scheduler_log_sandbox_switch(exiting_sandbox, next_sandbox); // Write back global at idx 0 wasm_globals_set_i64(&exiting_sandbox->globals, 0, sledge_abi__current_wasm_module_instance.abi.wasmg_0, true); if (add_to_cleanup_queue) local_cleanup_queue_add(exiting_sandbox); /* Do not touch sandbox struct after this point! */ if (next_sandbox != NULL) { scheduler_cooperative_switch_to(exiting_context, next_sandbox); } else { scheduler_switch_to_base_context(exiting_context); } } static inline bool scheduler_worker_would_preempt(int worker_idx) { assert(scheduler == SCHEDULER_EDF); uint64_t local_deadline = runtime_worker_threads_deadline[worker_idx]; uint64_t global_deadline = global_request_scheduler_peek(); return global_deadline < local_deadline; }