/*
 * Copyright 2011-2012 Samy Al Bahra.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The implementation here is inspired from the work described in:
 *   Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *   of Cambridge Computing Laboratory.
 */

#include <ck_backoff.h>
#include <ck_cc.h>
#include <ck_epoch.h>
#include <ck_pr.h>
#include <ck_stack.h>
#include <stdbool.h>

/*
 * Only three distinct epoch values are needed. If any thread is in
 * a "critical section" then it would have acquired some snapshot (e)
 * of the global epoch value (e_g) and set an active flag. Any hazardous
 * references will only occur after a full memory barrier. For example,
 * assume an initial e_g value of 1, e value of 0 and active value of 0.
 *
 * ck_epoch_begin(...)
 *   e = e_g
 *   active = 1
 *   memory_barrier();
 *
 * Any serialized reads may observe e = 0 or e = 1 with active = 0, or
 * e = 0 or e = 1 with active = 1. The e_g value can only go from 1
 * to 2 if every thread has already observed the value of "1" (or the
 * value we are incrementing from). This guarantees us that for any
 * given value e_g, any threads with-in critical sections (referred
 * to as "active" threads from here on) would have an e value of
 * e_g - 1 or e_g. This also means that hazardous references may be
 * shared in both e_g - 1 and e_g even if they are logically deleted
 * in e_g.
 *
 * For example, assume all threads have an e value of e_g. Another
 * thread may increment e_g to e_g + 1. Older threads may have
 * a reference to an object which is only deleted in e_g + 1. It
 * could be that reader threads are executing some hash table look-ups,
 * while some other writer thread (which causes epoch counter tick)
 * actually deletes the same items that reader threads are looking
 * up (this writer thread having an e value of e_g + 1). This is possible
 * if the writer thread re-observes the epoch after the counter tick.
 *
 * Pseudo-code for writer:
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *
 * Pseudo-code for reader:
 *   for (;;) {
 *      x = ht_lookup(x)
 *      ck_pr_inc(&x->value);
 *   }
 *
 * Of course, it is also possible for references logically deleted
 * at e_g - 1 to still be accessed at e_g as threads are "active"
 * at the same time (real-world time) mutating shared objects.
 *
 * Now, if the epoch counter is ticked to e_g + 1, then no new 
 * hazardous references could exist to objects logically deleted at
 * e_g - 1. The reason for this is that at e_g + 1, all epoch read-side
 * critical sections started at e_g - 1 must have been completed. If
 * any epoch read-side critical sections at e_g - 1 were still active,
 * then we would never increment to e_g + 1 (active != 0 && e != e_g).
 * Additionally, e_g may still have hazardous references to objects logically
 * deleted at e_g - 1 which means objects logically deleted at e_g - 1 cannot
 * be deleted at e_g + 1 (since it is valid for active threads to be at e_g or
 * e_g + 1 and threads at e_g still require safe memory accesses).
 *
 * However, at e_g + 2, all active threads must be either at e_g + 1 or
 * e_g + 2. Though e_g + 2 may share hazardous references with e_g + 1,
 * and e_g + 1 shares hazardous references to e_g, no active threads are
 * at e_g or e_g - 1. This means no hazardous references could exist to
 * objects deleted at e_g - 1 (at e_g + 2).
 *
 * To summarize these important points,
 *   1) Active threads will always have a value of e_g or e_g - 1.
 *   2) Items that are logically deleted at e_g or e_g - 1 cannot be
 *      physically deleted.
 *   3) Objects logically deleted at e_g - 1 can be physically destroyed
 *      at e_g + 2. In other words, for any current value of the global epoch
 *      counter e_g, objects logically deleted at e_g can be physically
 *      deleted at e_g + 3.
 *
 * Last but not least, if we are at e_g + 2, then no active thread is at
 * e_g which means it is safe to apply modulo-3 arithmetic to e_g value
 * in order to re-use e_g to represent the e_g + 3 state. This means it is
 * sufficient to represent e_g using only the values 0, 1 or 2. Every time
 * a thread re-visits an e_g (which can be determined with a non-empty deferral
 * list) it can assume objects in the e_g deferral list involved at least
 * three e_g transitions and are thus safe for physical deletion.
 *
 * Blocking semantics for epoch reclamation have additional restrictions.
 * Though we only require three deferral lists, reasonable blocking semantics
 * must be able to more gracefully handle bursty write work-loads which could
 * easily cause e_g wrap-around if modulo-3 arithmetic is used. This allows for
 * easy-to-trigger live-lock situations. The work-around is to not apply
 * modulo arithmetic to e_g but only to deferral list indexing.
 */
/*
 * Used by ck_epoch_barrier() both as the bound on its scan-and-advance
 * loop and as the distance from the starting epoch to its goal epoch.
 */
#define CK_EPOCH_GRACE 3U

/*
 * Values stored in ck_epoch_record.state. A record is marked USED when
 * it is registered or recycled and FREE when it is unregistered.
 * ck_epoch_scan() tests the FREE value as a bit mask.
 */
enum {
	CK_EPOCH_STATE_USED = 0,
	CK_EPOCH_STATE_FREE = 1
};

/*
 * Define ck_epoch_record_container() and ck_epoch_entry_container(),
 * which this file uses to get from a ck_stack_entry_t pointer to the
 * structure embedding it (through the record_next and stack_entry
 * members respectively).
 */
CK_STACK_CONTAINER(struct ck_epoch_record, record_next, ck_epoch_record_container)
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, ck_epoch_entry_container)

/*
 * Prepares an epoch object for use: the record list is emptied, the
 * free-record count is zeroed and the global epoch counter starts at 1.
 *
 * global: epoch object to initialize.
 */
void
ck_epoch_init(struct ck_epoch *global)
{

	ck_stack_init(&global->records);
	global->n_free = 0;
	global->epoch = 1;

	/* Store fence: the initialization above precedes any later store. */
	ck_pr_fence_store();
}

/*
 * Tries to claim a previously unregistered record for re-use.
 *
 * global: epoch object whose record list is searched.
 *
 * Returns a record whose state this call switched from FREE to USED,
 * or NULL when global->n_free reads as zero or no record could be
 * claimed.
 *
 * NOTE(review): nothing visible in this file increments
 * global->n_free. Confirm that it is raised elsewhere when a record is
 * unregistered; otherwise the early return below is always taken.
 */
struct ck_epoch_record *
ck_epoch_recycle(struct ck_epoch *global)
{
	struct ck_epoch_record *candidate;
	ck_stack_entry_t *entry;

	/* Fast path: skip the list walk when no record is counted as free. */
	if (ck_pr_load_uint(&global->n_free) == 0)
		return NULL;

	CK_STACK_FOREACH(&global->records, entry) {
		candidate = ck_epoch_record_container(entry);

		/* Cheap read first; only free-looking records get the atomic swap. */
		if (ck_pr_load_uint(&candidate->state) != CK_EPOCH_STATE_FREE)
			continue;

		ck_pr_fence_load();

		/*
		 * The swap decides ownership: only the caller that reads FREE
		 * back out of it has claimed the record.
		 */
		if (ck_pr_fas_uint(&candidate->state, CK_EPOCH_STATE_USED) !=
		    CK_EPOCH_STATE_FREE)
			continue;

		ck_pr_dec_uint(&global->n_free);
		return candidate;
	}

	return NULL;
}

/*
 * Resets every field of a caller-supplied record, marks it in use and
 * pushes it onto global->records.
 *
 * global: epoch object the record is attached to.
 * record: storage for the record; its previous contents are overwritten.
 */
void
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record)
{
	size_t slot;

	for (slot = 0; slot < CK_EPOCH_LENGTH; slot++)
		ck_stack_init(&record->pending[slot]);

	record->n_pending = 0;
	record->n_peak = 0;
	record->n_dispatch = 0;
	record->epoch = 0;
	record->active = 0;
	record->state = CK_EPOCH_STATE_USED;

	/* The record is fully initialized before it is linked into the list. */
	ck_pr_fence_store();
	ck_stack_push_upmc(&global->records, &record->record_next);
}

/*
 * Clears a record's counters and deferral lists and then marks it FREE.
 * The record is not unlinked from any list here.
 *
 * record: record being given up by its owner.
 *
 * NOTE(review): the deferral lists are re-initialized without running
 * the callbacks queued on them; presumably callers drain them first.
 * NOTE(review): ck_epoch_recycle() only searches for free records when
 * the epoch object's n_free counter is non-zero, and this function does
 * not (and, given its signature, cannot) update that counter. Confirm
 * where it is incremented.
 */
void
ck_epoch_unregister(struct ck_epoch_record *record)
{
	size_t slot;

	record->active = 0;

	for (slot = 0; slot < CK_EPOCH_LENGTH; slot++)
		ck_stack_init(&record->pending[slot]);

	record->epoch = 0;
	record->n_pending = 0;
	record->n_peak = 0;
	record->n_dispatch = 0;

	/* The reset is ordered before the store that advertises FREE. */
	ck_pr_fence_store();
	ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
}

/*
 * Looks for a record that is in use, flagged active, and whose epoch
 * snapshot differs from the given epoch.
 *
 * global: epoch object whose record list is walked.
 * cr:     record to resume from (it is re-examined), or NULL to start
 *         at the head of the list.
 * epoch:  epoch value every active record is expected to hold.
 *
 * Returns the first offending record, or NULL when the remainder of
 * the list holds none.
 */
static struct ck_epoch_record *
ck_epoch_scan(struct ck_epoch *global, struct ck_epoch_record *cr, unsigned int epoch)
{
	ck_stack_entry_t *cursor;

	if (cr == NULL) {
		cursor = CK_STACK_FIRST(&global->records);
	} else {
		cursor = &cr->record_next;
	}

	/*
	 * The cursor advances in the loop header so that every path through
	 * the body, including the skip for free records, moves on to the next
	 * entry. Skipping without advancing would spin on that record forever.
	 */
	for (; cursor != NULL; cursor = CK_STACK_NEXT(cursor)) {
		cr = ck_epoch_record_container(cursor);

		/* Unregistered records take no part in the epoch protocol. */
		if (ck_pr_load_uint(&cr->state) & CK_EPOCH_STATE_FREE)
			continue;

		if (ck_pr_load_uint(&cr->active) != 0 &&
		    ck_pr_load_uint(&cr->epoch) != epoch)
			return cr;
	}

	return NULL;
}

/*
 * Invokes every callback queued on one of the record's deferral lists,
 * updates the record's statistics and empties that list.
 *
 * record: record whose deferral list is drained.
 * e:      epoch value; the list index is e & (CK_EPOCH_LENGTH - 1)
 *         (acts as a modulo only if CK_EPOCH_LENGTH is a power of two;
 *         it is defined outside this file, so confirm).
 */
static void
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e)
{
	unsigned int slot = e & (CK_EPOCH_LENGTH - 1);
	unsigned int n_called = 0;
	ck_stack_entry_t *entry, *successor;

	/*
	 * The _SAFE iterator is used because the entry is handed to its own
	 * callback, which presumably may release it.
	 */
	CK_STACK_FOREACH_SAFE(&record->pending[slot], entry, successor) {
		struct ck_epoch_entry *deferred = ck_epoch_entry_container(entry);

		deferred->function(deferred);
		n_called++;
	}

	/* The peak is sampled before this batch is subtracted from n_pending. */
	if (record->n_peak < record->n_pending)
		record->n_peak = record->n_pending;

	record->n_pending -= n_called;
	record->n_dispatch += n_called;
	ck_stack_init(&record->pending[slot]);
}

/*
 * Waits until every active record has caught up with the global epoch,
 * advancing the epoch up to CK_EPOCH_GRACE times, and then runs all
 * callbacks deferred on the caller's record.
 *
 * global: epoch object to synchronize against.
 * record: caller's record; all of its deferral lists are dispatched and
 *         its epoch field is set to the last epoch value seen here.
 *
 * This function must not be called with-in read section (the scan
 * below waits on every active record in global->records whose epoch
 * lags; presumably that includes the caller's own).
 */
void
ck_epoch_barrier(struct ck_epoch *global, struct ck_epoch_record *record)
{
	struct ck_epoch_record *cr;
	unsigned int delta, epoch, goal, i;

	/*
	 * Guarantee any mutations previous to the barrier will be made visible
	 * with respect to epoch snapshots we will read.
	 */
	ck_pr_fence_memory();

	delta = epoch = ck_pr_load_uint(&global->epoch);
	goal = epoch + CK_EPOCH_GRACE;

	for (i = 0, cr = NULL; i < CK_EPOCH_GRACE; cr = NULL, i++) {
		/*
		 * Determine whether all threads have observed the current epoch.
		 * The scan resumes from the record that last failed the check.
		 */
		while (cr = ck_epoch_scan(global, cr, delta), cr != NULL)
			ck_pr_stall();

		/*
		 * Increment current epoch. CAS semantics are used to eliminate
		 * increment operations for synchronization that occurs for the
		 * same global epoch value snapshot.
		 *
		 * If we can guarantee there will only be one active barrier
		 * or epoch tick at a given time, then it is sufficient to
		 * use an increment operation. In a multi-barrier workload,
		 * however, it is possible to overflow the epoch value if we
		 * apply modulo-3 arithmetic.
		 *
		 * The CAS leaves the value it found in the global counter in
		 * delta. On failure that is already the current epoch. On
		 * success it is the value we replaced, so step it forward to
		 * match the counter; otherwise the next pass would scan against
		 * a stale epoch and waste its CAS.
		 */
		if (ck_pr_cas_uint_value(&global->epoch, delta, delta + 1, &delta) == true)
			delta = delta + 1;

		/*
		 * Right now, epoch overflow is handled as an edge case: the
		 * early exit is only considered when goal did not wrap around.
		 */
		if ((goal > epoch) & (delta > goal))
			break;
	}

	/*
	 * Other writers may have advanced the epoch on our behalf, so they
	 * may already have observed three or more epoch generations
	 * relative to the generation the caller has observed. In this case,
	 * it is safe to assume we are also in a grace period and are able to
	 * dispatch all calls across all lists.
	 */
	for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
		ck_epoch_dispatch(record, epoch);

	record->epoch = delta;
	return;
}

/*
 * Synonym for ck_epoch_barrier(): both arguments are forwarded
 * unchanged and nothing else is done.
 */
void
ck_epoch_synchronize(struct ck_epoch *global, struct ck_epoch_record *record)
{

	ck_epoch_barrier(global, record);
}

/*
 * Single, non-waiting attempt to advance the global epoch.
 *
 * global: epoch object to advance.
 * record: caller's record; on success one of its deferral lists is
 *         dispatched and its epoch field is updated.
 *
 * Returns false, without modifying anything, if some in-use record is
 * active with an epoch other than the one just read from global.
 * Otherwise tries to bump the global epoch by one, dispatches the
 * caller's deferral list selected by that new value, and returns true.
 */
bool
ck_epoch_poll(struct ck_epoch *global, struct ck_epoch_record *record)
{
	unsigned int observed, reported;

	observed = ck_pr_load_uint(&global->epoch);

	/* A full scan from the head of the list; any laggard aborts the poll. */
	if (ck_epoch_scan(global, NULL, observed) != NULL)
		return false;

	/*
	 * The result of the CAS is deliberately ignored: dispatch happens
	 * whether this call or a concurrent one moved the counter.
	 */
	ck_pr_cas_uint_value(&global->epoch, observed, observed + 1, &reported);
	ck_epoch_dispatch(record, observed + 1);

	/* Record whatever epoch value the CAS wrote to its output argument. */
	record->epoch = reported;
	return true;
}