ck_pring
Brendon Scheinman 12 years ago
commit cbfa095108

.gitignore

@@ -140,6 +140,7 @@ regressions/ck_brlock/benchmark/throughput
regressions/ck_rwlock/benchmark/throughput
regressions/ck_queue/validate/ck_list
regressions/ck_queue/validate/ck_slist
regressions/ck_queue/validate/ck_stailq
regressions/ck_cohort/validate/validate
regressions/ck_cohort/benchmark/ck_cohort.LATENCY
regressions/ck_cohort/benchmark/ck_cohort.THROUGHPUT

@@ -83,7 +83,7 @@ ck_brlock_write_lock(struct ck_brlock *br)
while (ck_pr_fas_uint(&br->writer, true) == true)
ck_pr_stall();
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
/* The reader list is protected under the writer br. */
for (cursor = br->readers; cursor != NULL; cursor = cursor->next) {
@@ -121,7 +121,7 @@ ck_brlock_write_trylock(struct ck_brlock *br, unsigned int factor)
* We do not require a strict fence here as atomic RMW operations
* are serializing.
*/
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
for (cursor = br->readers; cursor != NULL; cursor = cursor->next) {
while (ck_pr_load_uint(&cursor->n_readers) != 0) {
@@ -190,13 +190,19 @@ ck_brlock_read_lock(struct ck_brlock *br, struct ck_brlock_reader *reader)
#if defined(__x86__) || defined(__x86_64__)
ck_pr_fas_uint(&reader->n_readers, 1);
/* Serialize counter update with respect to writer snapshot. */
ck_pr_fence_memory();
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_atomic_load();
#else
ck_pr_store_uint(&reader->n_readers, 1);
/* Loads can be re-ordered before previous stores, even on TSO. */
ck_pr_fence_strict_memory();
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_store_load();
#endif
if (ck_pr_load_uint(&br->writer) == false)
@@ -229,10 +235,23 @@ ck_brlock_read_trylock(struct ck_brlock *br,
ck_pr_stall();
}
#if defined(__x86__) || defined(__x86_64__)
ck_pr_fas_uint(&reader->n_readers, 1);
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_atomic_load();
#else
ck_pr_store_uint(&reader->n_readers, 1);
/* Loads are re-ordered with respect to prior stores. */
ck_pr_fence_strict_memory();
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_store_load();
#endif
if (ck_pr_load_uint(&br->writer) == false)
break;
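To illustrate the ordering these brlock hunks are after: the reader publishes its presence with an atomic RMW and then loads the writer flag, while the writer does the mirror image. The one reordering TSO permits, a later load overtaking an earlier store, is exactly the reordering that would let both sides miss each other, which is why an atomic-to-load (or store-to-load) fence sits between the two steps. A minimal sketch, using only ck_pr primitives that appear in this diff; the flag names are illustrative stand-ins for reader->n_readers and br->writer.

#include <ck_pr.h>
#include <stdbool.h>

static unsigned int n_readers;	/* stand-in for reader->n_readers */
static unsigned int writer;	/* stand-in for br->writer */

static bool
reader_enter(void)
{

	ck_pr_fas_uint(&n_readers, 1);	/* publish the reader (atomic RMW) */
	ck_pr_fence_atomic_load();	/* RMW is visible before the load below */

	/* true: no writer observed, the read-side section may proceed. */
	return ck_pr_load_uint(&writer) == 0;
}

static bool
writer_enter(void)
{

	ck_pr_fas_uint(&writer, 1);	/* publish the writer (atomic RMW) */
	ck_pr_fence_atomic_load();	/* RMW is visible before reading n_readers */

	/* true: no readers observed, the write-side section may proceed. */
	return ck_pr_load_uint(&n_readers) == 0;
}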

@@ -93,7 +93,7 @@ ck_bytelock_write_lock(struct ck_bytelock *bytelock, unsigned int slot)
ck_pr_store_8(&bytelock->readers[slot - 1], false);
/* Wait for slotted readers to drain out. */
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
for (i = 0; i < sizeof(bytelock->readers) / CK_BYTELOCK_LENGTH; i++) {
while (CK_BYTELOCK_LOAD((CK_BYTELOCK_TYPE *)&readers[i]) != false)
ck_pr_stall();
@@ -134,7 +134,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot)
if (slot > sizeof bytelock->readers) {
for (;;) {
ck_pr_inc_uint(&bytelock->n_readers);
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&bytelock->owner) == 0)
break;
ck_pr_dec_uint(&bytelock->n_readers);
@@ -150,7 +150,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot)
slot -= 1;
for (;;) {
ck_pr_store_8(&bytelock->readers[slot], true);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
/*
* If there is no owner at this point, our slot has

@@ -97,12 +97,11 @@ ck_epoch_begin(ck_epoch_t *epoch, ck_epoch_record_t *record)
/*
* It is possible for loads to be re-ordered before the store
* is committed into the caller's epoch and active fields.
* Execute a full barrier to serialize stores with respect to
* loads
* For this reason, store to load serialization is necessary.
*/
ck_pr_store_uint(&record->epoch, g_epoch);
ck_pr_store_uint(&record->active, 1);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
return;
}
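The rewritten comment states the requirement; concretely, the loads performed inside the read-side section must not be satisfied before the stores to record->epoch and record->active are visible, or a thread scanning records for a grace period could see the record as inactive and reclaim memory the reader is about to dereference. An annotated sketch of the same sequence (the scanning side is described only in the comment, since it is not part of this hunk):

ck_pr_store_uint(&record->epoch, g_epoch);	/* announce the observed epoch */
ck_pr_store_uint(&record->active, 1);		/* announce the read-side section */
ck_pr_fence_store_load();			/* both stores precede any later load */
/*
 * From here on, loads of epoch-protected objects are safe: a concurrent
 * scan of records either observes active == 1, or this thread's loads
 * observe state written after that scan.
 */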

@@ -81,7 +81,7 @@ ck_hp_fifo_enqueue_mpmc(ck_hp_record_t *record,
for (;;) {
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, tail);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (tail != ck_pr_load_ptr(&fifo->tail))
continue;
@@ -112,7 +112,7 @@ ck_hp_fifo_tryenqueue_mpmc(ck_hp_record_t *record,
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, tail);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (tail != ck_pr_load_ptr(&fifo->tail))
return false;
@@ -140,13 +140,13 @@ ck_hp_fifo_dequeue_mpmc(ck_hp_record_t *record,
ck_pr_fence_load();
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, head);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
continue;
next = ck_pr_load_ptr(&head->next);
ck_hp_set(record, 1, next);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
continue;
@@ -175,13 +175,13 @@ ck_hp_fifo_trydequeue_mpmc(ck_hp_record_t *record,
ck_pr_fence_load();
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, head);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
return NULL;
next = ck_pr_load_ptr(&head->next);
ck_hp_set(record, 1, next);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
return NULL;
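Every path in this file follows the same publish-then-revalidate protocol, and the fence being renamed here is what makes the revalidation meaningful: announcing the hazard pointer is a store, re-checking the source slot is a load, so store-to-load is the minimum ordering that guarantees a reclaimer either sees the announcement or the reader sees the slot change and retries. A condensed sketch of that loop; hp_protect is an illustrative helper, not part of CK:

#include <ck_hp.h>
#include <ck_pr.h>

static void *
hp_protect(ck_hp_record_t *record, void **slot)
{
	void *p;

	for (;;) {
		p = ck_pr_load_ptr(slot);	/* candidate pointer */
		ck_hp_set(record, 0, p);	/* announce it as hazardous */
		ck_pr_fence_store_load();	/* announcement precedes the re-check */
		if (p == ck_pr_load_ptr(slot))
			return p;		/* still current: protected */
	}
}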

@@ -62,7 +62,7 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target)
return NULL;
ck_hp_set(record, 0, entry);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
} while (entry != ck_pr_load_ptr(&target->head));
while (ck_pr_cas_ptr_value(&target->head, entry, entry->next, &entry) == false) {
@@ -70,11 +70,11 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target)
return NULL;
ck_hp_set(record, 0, entry);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
update = ck_pr_load_ptr(&target->head);
while (entry != update) {
ck_hp_set(record, 0, update);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
entry = update;
update = ck_pr_load_ptr(&target->head);
if (update == NULL)
@@ -95,7 +95,7 @@ ck_hp_stack_trypop_mpmc(ck_hp_record_t *record, struct ck_stack *target, struct
return false;
ck_hp_set(record, 0, entry);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (entry != ck_pr_load_ptr(&target->head))
goto leave;

@@ -30,6 +30,7 @@
#include <ck_cc.h>
#include <ck_limits.h>
#include <ck_md.h>
#include <ck_stdint.h>
#include <stdbool.h>
@@ -43,12 +44,97 @@
#include "gcc/ppc64/ck_pr.h"
#elif defined(__ppc__)
#include "gcc/ppc/ck_pr.h"
#elif defined(__GNUC__)
#include "gcc/ck_pr.h"
#else
#elif !defined(__GNUC__)
#error Your platform is unsupported
#endif
#if defined(__GNUC__)
#include "gcc/ck_pr.h"
#endif
#define CK_PR_FENCE_EMIT(T) \
CK_CC_INLINE static void \
ck_pr_fence_##T(void) \
{ \
ck_pr_fence_strict_##T(); \
}
#define CK_PR_FENCE_NOOP(T) \
CK_CC_INLINE static void \
ck_pr_fence_##T(void) \
{ \
return; \
}
/*
* None of the currently supported platforms allow for data-dependent
* load ordering.
*/
CK_PR_FENCE_NOOP(load_depends)
#define ck_pr_fence_strict_load_depends ck_pr_fence_load_depends
/*
* In memory models where atomic operations do not have serializing
* effects, atomic read-modify-write operations are modeled as stores.
*/
#if defined(CK_MD_RMO)
/*
* Only stores to the same location have a global
* ordering.
*/
CK_PR_FENCE_EMIT(atomic)
CK_PR_FENCE_EMIT(atomic_atomic)
CK_PR_FENCE_EMIT(atomic_load)
CK_PR_FENCE_EMIT(atomic_store)
CK_PR_FENCE_EMIT(store_atomic)
CK_PR_FENCE_EMIT(load_atomic)
CK_PR_FENCE_EMIT(load_load)
CK_PR_FENCE_EMIT(load_store)
CK_PR_FENCE_EMIT(store_store)
CK_PR_FENCE_EMIT(store_load)
CK_PR_FENCE_EMIT(load)
CK_PR_FENCE_EMIT(store)
CK_PR_FENCE_EMIT(memory)
#elif defined(CK_MD_PSO)
/*
* Anything can be re-ordered with respect to stores.
* Otherwise, loads are executed in-order.
*/
CK_PR_FENCE_EMIT(atomic)
CK_PR_FENCE_EMIT(atomic_atomic)
CK_PR_FENCE_NOOP(atomic_load)
CK_PR_FENCE_EMIT(atomic_store)
CK_PR_FENCE_EMIT(store_atomic)
CK_PR_FENCE_NOOP(load_atomic)
CK_PR_FENCE_NOOP(load_load)
CK_PR_FENCE_EMIT(load_store)
CK_PR_FENCE_EMIT(store_store)
CK_PR_FENCE_EMIT(store_load)
CK_PR_FENCE_NOOP(load)
CK_PR_FENCE_EMIT(store)
CK_PR_FENCE_EMIT(memory)
#elif defined(CK_MD_TSO)
/*
* Only loads are re-ordered and only with respect to
* prior stores. Atomic operations are serializing.
*/
CK_PR_FENCE_NOOP(atomic)
CK_PR_FENCE_NOOP(atomic_atomic)
CK_PR_FENCE_NOOP(atomic_load)
CK_PR_FENCE_NOOP(atomic_store)
CK_PR_FENCE_NOOP(store_atomic)
CK_PR_FENCE_NOOP(load_atomic)
CK_PR_FENCE_NOOP(load_load)
CK_PR_FENCE_NOOP(load_store)
CK_PR_FENCE_NOOP(store_store)
CK_PR_FENCE_EMIT(store_load)
CK_PR_FENCE_NOOP(load)
CK_PR_FENCE_NOOP(store)
CK_PR_FENCE_NOOP(memory)
#endif /* CK_MD_TSO */
#undef CK_PR_FENCE_EMIT
#undef CK_PR_FENCE_NOOP
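The net effect of the two macros is that every ck_pr_fence_X_Y() is either an alias for its strict counterpart (when the selected memory model can actually reorder X with respect to Y) or an empty inline that vanishes at compile time. For example, under CK_MD_TSO the table above expands to, in sketch form:

CK_CC_INLINE static void
ck_pr_fence_store_load(void)
{

	ck_pr_fence_strict_store_load();	/* the one reordering TSO allows */
}

CK_CC_INLINE static void
ck_pr_fence_load_load(void)
{

	return;					/* load->load is already ordered */
}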
#define CK_PR_BIN(K, S, M, T, P, C) \
CK_CC_INLINE static void \
ck_pr_##K##_##S(M *target, T value) \

@@ -74,7 +74,8 @@ ck_rwlock_write_trylock(ck_rwlock_t *rw)
if (ck_pr_fas_uint(&rw->writer, 1) != 0)
return false;
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->n_readers) != 0) {
ck_rwlock_write_unlock(rw);
return false;
@@ -90,7 +91,7 @@ ck_rwlock_write_lock(ck_rwlock_t *rw)
while (ck_pr_fas_uint(&rw->writer, 1) != 0)
ck_pr_stall();
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
while (ck_pr_load_uint(&rw->n_readers) != 0)
ck_pr_stall();
@@ -111,16 +112,15 @@ ck_rwlock_read_trylock(ck_rwlock_t *rw)
* Serialize with respect to concurrent write
* lock operation.
*/
ck_pr_fence_memory();
if (ck_pr_load_uint(&rw->writer) == 0)
goto leave;
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->writer) == 0) {
ck_pr_fence_load();
return true;
}
ck_pr_dec_uint(&rw->n_readers);
return false;
leave:
/* Acquire semantics are necessary. */
ck_pr_fence_load();
return true;
}
CK_CC_INLINE static void
@@ -137,7 +137,8 @@ ck_rwlock_read_lock(ck_rwlock_t *rw)
* Serialize with respect to concurrent write
* lock operation.
*/
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->writer) == 0)
break;
ck_pr_dec_uint(&rw->n_readers);
@@ -180,7 +181,7 @@ ck_rwlock_recursive_write_lock(ck_rwlock_recursive_t *rw, unsigned int tid)
while (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false)
ck_pr_stall();
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
while (ck_pr_load_uint(&rw->rw.n_readers) != 0)
ck_pr_stall();
@@ -202,7 +203,7 @@ ck_rwlock_recursive_write_trylock(ck_rwlock_recursive_t *rw, unsigned int tid)
if (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false)
return false;
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->rw.n_readers) != 0) {
ck_pr_store_uint(&rw->rw.writer, 0);
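The substitutions in this file all have the same shape: the operation immediately before the fence is an atomic RMW (fas or cas) and the only thing that must be ordered after it is the load of the opposing reader count or writer flag, so ck_pr_fence_atomic_load() suffices where ck_pr_fence_memory() was stronger than needed. Annotated as a sketch, the write-side pattern reads:

while (ck_pr_fas_uint(&rw->writer, 1) != 0)	/* atomic RMW announces the writer */
	ck_pr_stall();

ck_pr_fence_atomic_load();			/* TSO: no-op, the RMW serializes   */
						/* PSO/RMO: a real fence is emitted */

while (ck_pr_load_uint(&rw->n_readers) != 0)	/* now safe to wait on readers */
	ck_pr_stall();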

@@ -142,7 +142,7 @@ ck_spinlock_anderson_lock(struct ck_spinlock_anderson *lock,
/* Prepare slot for potential re-use by another thread. */
ck_pr_store_uint(&lock->slots[position].locked, true);
ck_pr_fence_store();
ck_pr_fence_memory();
*slot = lock->slots + position;
return;
@@ -194,7 +194,7 @@ ck_spinlock_fas_trylock(struct ck_spinlock_fas *lock)
if (value == false)
ck_pr_fence_memory();
return (!value);
return !value;
}
CK_CC_INLINE static bool
@@ -268,7 +268,7 @@ ck_spinlock_cas_trylock(struct ck_spinlock_cas *lock)
if (value == false)
ck_pr_fence_memory();
return (!value);
return !value;
}
CK_CC_INLINE static bool
@@ -658,9 +658,9 @@ CK_CC_INLINE static bool
ck_spinlock_mcs_trylock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *node)
{
ck_pr_store_uint(&node->locked, true);
ck_pr_store_ptr(&node->next, NULL);
ck_pr_fence_store();
node->locked = true;
node->next = NULL;
ck_pr_fence_store_atomic();
if (ck_pr_cas_ptr(queue, NULL, node) == true) {
ck_pr_fence_load();
@@ -686,24 +686,24 @@ ck_spinlock_mcs_lock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *nod
* In the case that there is a successor, let them know they must wait
* for us to unlock.
*/
ck_pr_store_uint(&node->locked, true);
ck_pr_store_ptr(&node->next, NULL);
node->locked = true;
node->next = NULL;
ck_pr_fence_store_atomic();
/*
* Swap current tail with current lock request. If the swap operation
* returns NULL, it means the queue was empty. If the queue was empty,
* then the operation is complete.
*/
ck_pr_fence_memory();
previous = ck_pr_fas_ptr(queue, node);
if (previous == NULL)
return;
/* Let the previous lock holder know that we are waiting on them. */
ck_pr_store_ptr(&previous->next, node);
while (ck_pr_load_uint(&node->locked) == true)
ck_pr_stall();
if (previous != NULL) {
/* Let the previous lock holder know that we are waiting on them. */
ck_pr_store_ptr(&previous->next, node);
while (ck_pr_load_uint(&node->locked) == true)
ck_pr_stall();
}
ck_pr_fence_load();
return;
}
@@ -712,6 +712,8 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n
{
struct ck_spinlock_mcs *next;
ck_pr_fence_memory();
next = ck_pr_load_ptr(&node->next);
if (next == NULL) {
/*
@@ -721,7 +723,6 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n
*/
if (ck_pr_load_ptr(queue) == node &&
ck_pr_cas_ptr(queue, node, NULL) == true) {
ck_pr_fence_memory();
return;
}
@@ -740,9 +741,7 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n
}
/* Allow the next lock operation to complete. */
ck_pr_fence_memory();
ck_pr_store_uint(&next->locked, false);
return;
}
#endif /* CK_F_SPINLOCK_MCS */
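For context on the restructured lock path: an MCS lock threads waiters through caller-supplied nodes, so each acquisition provides a node that must remain live until the matching unlock; a per-acquisition stack allocation is the usual idiom. A usage sketch under that assumption, using only the types and functions visible in this hunk:

#include <ck_spinlock.h>

static struct ck_spinlock_mcs *queue;	/* NULL while the lock is free */
static unsigned long counter;

static void
counter_increment(void)
{
	struct ck_spinlock_mcs node;	/* this acquisition's queue entry */

	ck_spinlock_mcs_lock(&queue, &node);
	counter++;			/* critical section */
	ck_spinlock_mcs_unlock(&queue, &node);
}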

@@ -31,9 +31,21 @@
#error Do not include this file directly, use ck_pr.h
#endif
#include <ck_cc.h>
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#ifndef CK_F_PR
#define CK_F_PR
#include <stdbool.h>
#include <ck_stdint.h>
#include <ck_cc.h>
/*
* The following represent supported atomic operations.
@@ -93,45 +105,32 @@ ck_pr_stall(void)
return;
}
/*
* Most target architectures do not require this.
*/
CK_CC_INLINE static void
ck_pr_fence_load_depends(void)
{
__sync_synchronize();
return;
}
/*
* Load and store fences are equivalent to full fences in the GCC port.
*/
#define CK_PR_FENCE(T) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__sync_synchronize(); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__sync_synchronize(); \
}
CK_PR_FENCE(atomic)
CK_PR_FENCE(atomic_atomic)
CK_PR_FENCE(atomic_load)
CK_PR_FENCE(atomic_store)
CK_PR_FENCE(store_atomic)
CK_PR_FENCE(load_atomic)
CK_PR_FENCE(load)
CK_PR_FENCE(load_load)
CK_PR_FENCE(load_store)
CK_PR_FENCE(store)
CK_PR_FENCE(store_store)
CK_PR_FENCE(store_load)
CK_PR_FENCE(memory)
#undef CK_PR_FENCE
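As the comment above notes, every fence in this generic port is a full barrier regardless of granularity; the fine-grained names only become cheaper in the architecture-specific ports. For one instance, the macro expands to:

CK_CC_INLINE static void
ck_pr_fence_strict_store_load(void)
{
	__sync_synchronize();
}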
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
/*
* Atomic compare and swap.
*/
@@ -275,5 +274,5 @@ CK_PR_UNARY_S(8, uint8_t)
#undef CK_PR_UNARY_S
#undef CK_PR_UNARY
#endif /* !CK_F_PR */
#endif /* _CK_PR_GCC_H */

@@ -41,6 +41,11 @@
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* This bounces the hardware thread from low to medium
* priority. I am unsure of the benefits of this approach
@@ -55,45 +60,29 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(load_depends, "")
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
CK_PR_FENCE(atomic, "lwsync")
CK_PR_FENCE(atomic_atomic, "lwsync")
CK_PR_FENCE(atomic_store, "lwsync")
CK_PR_FENCE(atomic_load, "sync")
CK_PR_FENCE(store_atomic, "lwsync")
CK_PR_FENCE(load_atomic, "lwsync")
CK_PR_FENCE(store, "lwsync")
CK_PR_FENCE(store_store, "lwsync")
CK_PR_FENCE(store_load, "sync")
CK_PR_FENCE(load, "lwsync")
CK_PR_FENCE(load_load, "lwsync")
CK_PR_FENCE(load_store, "lwsync")
CK_PR_FENCE(memory, "sync")
#undef CK_PR_FENCE
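The instruction selection above follows from what lwsync does and does not order: it covers load->load, load->store and store->store, but not store->load, so the orderings that contain a store-then-load edge (store_load, atomic_load, memory) pay for a full sync while the rest use lwsync. Two expansions of the macro, for contrast:

CK_CC_INLINE static void
ck_pr_fence_strict_store_store(void)
{
	__asm__ __volatile__("lwsync" ::: "memory");
}

CK_CC_INLINE static void
ck_pr_fence_strict_store_load(void)
{
	__asm__ __volatile__("sync" ::: "memory");
}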
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_load_##S(const M *target) \

@@ -40,6 +40,11 @@
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* This bounces the hardware thread from low to medium
* priority. I am unsure of the benefits of this approach
@@ -54,49 +59,33 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
/*
* These are derived from:
* http://www.ibm.com/developerworks/systems/articles/powerpc.html
*/
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(atomic, "lwsync")
CK_PR_FENCE(atomic_atomic, "lwsync")
CK_PR_FENCE(atomic_store, "lwsync")
CK_PR_FENCE(atomic_load, "sync")
CK_PR_FENCE(store_atomic, "lwsync")
CK_PR_FENCE(load_atomic, "lwsync")
CK_PR_FENCE(store, "lwsync")
CK_PR_FENCE(store_store, "lwsync")
CK_PR_FENCE(store_load, "sync")
CK_PR_FENCE(load, "lwsync")
CK_PR_FENCE(load_load, "lwsync")
CK_PR_FENCE(load_store, "lwsync")
CK_PR_FENCE(memory, "sync")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_load_##S(const M *target) \

@@ -40,6 +40,11 @@
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* Order loads at the least.
*/
@@ -51,51 +56,33 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
/*
* If RMO is forced, then do not assume TSO model.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
/*
* By default, we will assume TSO model is used on SPARCv9.
* Atomic operations are treated as both load and store
* operations on SPARCv9.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(atomic_atomic, "membar #StoreStore")
CK_PR_FENCE(atomic, "membar #StoreStore")
CK_PR_FENCE(atomic_store, "membar #StoreStore")
CK_PR_FENCE(atomic_load, "membar #StoreLoad")
CK_PR_FENCE(store_atomic, "membar #StoreStore")
CK_PR_FENCE(load_atomic, "membar #LoadStore")
CK_PR_FENCE(store, "membar #StoreStore")
CK_PR_FENCE(store_store, "membar #StoreStore")
CK_PR_FENCE(store_load, "membar #StoreLoad")
CK_PR_FENCE(load, "membar #LoadLoad")
CK_PR_FENCE(load_load, "membar #LoadLoad")
CK_PR_FENCE(load_store, "membar #LoadStore")
CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
#undef CK_PR_FENCE
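Given the comment above, that an atomic operation is treated as both a load and a store on SPARCv9, the atomic_* mappings fall out directly: ordering an RMW before a subsequent load is the store->load case, hence #StoreLoad, and ordering a load before a subsequent RMW is load->store, hence #LoadStore. One expansion as a sketch:

CK_CC_INLINE static void
ck_pr_fence_strict_atomic_load(void)
{
	__asm__ __volatile__("membar #StoreLoad" ::: "memory");
}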
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_load_##S(const M *target) \

@@ -63,52 +63,29 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
/*
* IA32 has strong memory ordering guarantees, so memory
* fences are enabled if and only if the user specifies that
* the program will be using non-temporal instructions.
* Otherwise, an optimization barrier is used in order to prevent
* compiler re-ordering of loads and stores across the barrier.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(atomic, "sfence")
CK_PR_FENCE(atomic_atomic, "sfence")
CK_PR_FENCE(atomic_store, "sfence")
CK_PR_FENCE(atomic_load, "mfence")
CK_PR_FENCE(store_atomic, "sfence")
CK_PR_FENCE(load_atomic, "mfence")
CK_PR_FENCE(load, "lfence")
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(load_load, "lfence")
CK_PR_FENCE(load_store, "mfence")
CK_PR_FENCE(store, "sfence")
CK_PR_FENCE(store_store, "sfence")
CK_PR_FENCE(store_load, "mfence")
CK_PR_FENCE(memory, "mfence")
#undef CK_PR_FENCE
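On x86-64 the only reordering the hardware performs is a later load overtaking an earlier store, so the store_load ordering (and the combinations that contain it) is the case that genuinely needs mfence; pure store orderings map to sfence and pure load orderings to lfence, as listed above. The corresponding expansion, as a sketch:

CK_CC_INLINE static void
ck_pr_fence_strict_store_load(void)
{
	__asm__ __volatile__("mfence" ::: "memory");
}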
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
/*
* Atomic fetch-and-store operations.
*/

@@ -62,52 +62,27 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
/*
* IA32 has strong memory ordering guarantees, so memory
* fences are enabled if and only if the user specifies that
* the program will be using non-temporal instructions.
* Otherwise, an optimization barrier is used in order to prevent
* compiler re-ordering of loads and stores across the barrier.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(atomic_store, "sfence")
CK_PR_FENCE(atomic_load, "mfence")
CK_PR_FENCE(store_atomic, "sfence")
CK_PR_FENCE(load_atomic, "mfence")
CK_PR_FENCE(load, "lfence")
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(load_load, "lfence")
CK_PR_FENCE(load_store, "mfence")
CK_PR_FENCE(store, "sfence")
CK_PR_FENCE(store_store, "sfence")
CK_PR_FENCE(store_load, "mfence")
CK_PR_FENCE(memory, "mfence")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
/*
* Atomic fetch-and-store operations.
*/

@@ -3,16 +3,16 @@
all: ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2
ck_pr_cas_64_2: ck_pr_cas_64_2.c
$(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c
$(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c -lm
ck_pr_cas_64: ck_pr_cas_64.c
$(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c
$(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c -lm
ck_pr_fas_64: ck_pr_fas_64.c
$(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c
$(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c -lm
clean:
rm -rf ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 *.dSYM *.exe
include ../../../build/regressions.build
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE

@@ -14,67 +14,67 @@ OBJECTS=ck_ticket.THROUGHPUT ck_ticket.LATENCY \
all: $(OBJECTS)
ck_spinlock.THROUGHPUT: ck_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c -lm
ck_spinlock.LATENCY: ck_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c -lm
ck_ticket.THROUGHPUT: ck_ticket.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c -lm
ck_ticket.LATENCY: ck_ticket.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c -lm
ck_mcs.THROUGHPUT: ck_mcs.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c -lm
ck_mcs.LATENCY: ck_mcs.c
$(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c
$(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c -lm
ck_dec.THROUGHPUT: ck_dec.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c -lm
ck_dec.LATENCY: ck_dec.c
$(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c
$(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c -lm
ck_cas.THROUGHPUT: ck_cas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c -lm
ck_cas.LATENCY: ck_cas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c -lm
ck_fas.THROUGHPUT: ck_fas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c -lm
ck_fas.LATENCY: ck_fas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c -lm
ck_clh.THROUGHPUT: ck_clh.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c -lm
ck_clh.LATENCY: ck_clh.c
$(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c
$(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c -lm
linux_spinlock.THROUGHPUT: linux_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c -lm
linux_spinlock.LATENCY: linux_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c -lm
ck_ticket_pb.THROUGHPUT: ck_ticket_pb.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c -lm
ck_ticket_pb.LATENCY: ck_ticket_pb.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c -lm
ck_anderson.THROUGHPUT: ck_anderson.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c -lm
ck_anderson.LATENCY: ck_anderson.c
$(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c
$(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c -lm
clean:
rm -rf *.dSYM *.exe $(OBJECTS)
include ../../../build/regressions.build
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE
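The -lm change in both Makefiles has the same rationale: a library listed in CFLAGS lands before the source files on the compile-and-link command line, and linkers that resolve symbols left to right may then drop it, so the flag moves to the end of each link command instead. The resulting rule shape, with a hypothetical target name:

ck_example.THROUGHPUT: ck_example.c
	$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_example.THROUGHPUT ck_example.c -lm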
