diff --git a/.gitignore b/.gitignore index 05a48ab..276ccc9 100644 --- a/.gitignore +++ b/.gitignore @@ -140,6 +140,7 @@ regressions/ck_brlock/benchmark/throughput regressions/ck_rwlock/benchmark/throughput regressions/ck_queue/validate/ck_list regressions/ck_queue/validate/ck_slist +regressions/ck_queue/validate/ck_stailq regressions/ck_cohort/validate/validate regressions/ck_cohort/benchmark/ck_cohort.LATENCY regressions/ck_cohort/benchmark/ck_cohort.THROUGHPUT diff --git a/include/ck_brlock.h b/include/ck_brlock.h index 7b1f27d..4246c7d 100644 --- a/include/ck_brlock.h +++ b/include/ck_brlock.h @@ -83,7 +83,7 @@ ck_brlock_write_lock(struct ck_brlock *br) while (ck_pr_fas_uint(&br->writer, true) == true) ck_pr_stall(); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); /* The reader list is protected under the writer br. */ for (cursor = br->readers; cursor != NULL; cursor = cursor->next) { @@ -121,7 +121,7 @@ ck_brlock_write_trylock(struct ck_brlock *br, unsigned int factor) * We do not require a strict fence here as atomic RMW operations * are serializing. */ - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); for (cursor = br->readers; cursor != NULL; cursor = cursor->next) { while (ck_pr_load_uint(&cursor->n_readers) != 0) { @@ -190,13 +190,19 @@ ck_brlock_read_lock(struct ck_brlock *br, struct ck_brlock_reader *reader) #if defined(__x86__) || defined(__x86_64__) ck_pr_fas_uint(&reader->n_readers, 1); - /* Serialize counter update with respect to writer snapshot. */ - ck_pr_fence_memory(); + /* + * Serialize reader counter update with respect to load of + * writer. + */ + ck_pr_fence_atomic_load(); #else ck_pr_store_uint(&reader->n_readers, 1); - /* Loads can be re-ordered before previous stores, even on TSO. */ - ck_pr_fence_strict_memory(); + /* + * Serialize reader counter update with respect to load of + * writer. + */ + ck_pr_fence_store_load(); #endif if (ck_pr_load_uint(&br->writer) == false) @@ -229,10 +235,23 @@ ck_brlock_read_trylock(struct ck_brlock *br, ck_pr_stall(); } +#if defined(__x86__) || defined(__x86_64__) + ck_pr_fas_uint(&reader->n_readers, 1); + + /* + * Serialize reader counter update with respect to load of + * writer. + */ + ck_pr_fence_atomic_load(); +#else ck_pr_store_uint(&reader->n_readers, 1); - /* Loads are re-ordered with respect to prior stores. */ - ck_pr_fence_strict_memory(); + /* + * Serialize reader counter update with respect to load of + * writer. + */ + ck_pr_fence_store_load(); +#endif if (ck_pr_load_uint(&br->writer) == false) break; diff --git a/include/ck_bytelock.h b/include/ck_bytelock.h index f73adb2..9d42393 100644 --- a/include/ck_bytelock.h +++ b/include/ck_bytelock.h @@ -93,7 +93,7 @@ ck_bytelock_write_lock(struct ck_bytelock *bytelock, unsigned int slot) ck_pr_store_8(&bytelock->readers[slot - 1], false); /* Wait for slotted readers to drain out. 
*/ - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); for (i = 0; i < sizeof(bytelock->readers) / CK_BYTELOCK_LENGTH; i++) { while (CK_BYTELOCK_LOAD((CK_BYTELOCK_TYPE *)&readers[i]) != false) ck_pr_stall(); @@ -134,7 +134,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot) if (slot > sizeof bytelock->readers) { for (;;) { ck_pr_inc_uint(&bytelock->n_readers); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); if (ck_pr_load_uint(&bytelock->owner) == 0) break; ck_pr_dec_uint(&bytelock->n_readers); @@ -150,7 +150,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot) slot -= 1; for (;;) { ck_pr_store_8(&bytelock->readers[slot], true); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); /* * If there is no owner at this point, our slot has diff --git a/include/ck_epoch.h b/include/ck_epoch.h index 4624bdf..c300a11 100644 --- a/include/ck_epoch.h +++ b/include/ck_epoch.h @@ -97,12 +97,11 @@ ck_epoch_begin(ck_epoch_t *epoch, ck_epoch_record_t *record) /* * It is possible for loads to be re-ordered before the store * is committed into the caller's epoch and active fields. - * Execute a full barrier to serialize stores with respect to - * loads + * For this reason, store to load serialization is necessary. */ ck_pr_store_uint(&record->epoch, g_epoch); ck_pr_store_uint(&record->active, 1); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); return; } diff --git a/include/ck_hp_fifo.h b/include/ck_hp_fifo.h index 41064f3..8c0d08b 100644 --- a/include/ck_hp_fifo.h +++ b/include/ck_hp_fifo.h @@ -81,7 +81,7 @@ ck_hp_fifo_enqueue_mpmc(ck_hp_record_t *record, for (;;) { tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, tail); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (tail != ck_pr_load_ptr(&fifo->tail)) continue; @@ -112,7 +112,7 @@ ck_hp_fifo_tryenqueue_mpmc(ck_hp_record_t *record, tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, tail); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (tail != ck_pr_load_ptr(&fifo->tail)) return false; @@ -140,13 +140,13 @@ ck_hp_fifo_dequeue_mpmc(ck_hp_record_t *record, ck_pr_fence_load(); tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, head); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) continue; next = ck_pr_load_ptr(&head->next); ck_hp_set(record, 1, next); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) continue; @@ -175,13 +175,13 @@ ck_hp_fifo_trydequeue_mpmc(ck_hp_record_t *record, ck_pr_fence_load(); tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, head); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) return NULL; next = ck_pr_load_ptr(&head->next); ck_hp_set(record, 1, next); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) return NULL; diff --git a/include/ck_hp_stack.h b/include/ck_hp_stack.h index 2a7856c..7ac8821 100644 --- a/include/ck_hp_stack.h +++ b/include/ck_hp_stack.h @@ -62,7 +62,7 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target) return NULL; ck_hp_set(record, 0, entry); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); } while (entry != ck_pr_load_ptr(&target->head)); while (ck_pr_cas_ptr_value(&target->head, entry, entry->next, &entry) == false) { @@ -70,11 +70,11 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target) return NULL; ck_hp_set(record, 0, 
entry); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); update = ck_pr_load_ptr(&target->head); while (entry != update) { ck_hp_set(record, 0, update); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); entry = update; update = ck_pr_load_ptr(&target->head); if (update == NULL) @@ -95,7 +95,7 @@ ck_hp_stack_trypop_mpmc(ck_hp_record_t *record, struct ck_stack *target, struct return false; ck_hp_set(record, 0, entry); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (entry != ck_pr_load_ptr(&target->head)) goto leave; diff --git a/include/ck_pr.h b/include/ck_pr.h index 35540b3..e7c98f7 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -43,12 +44,97 @@ #include "gcc/ppc64/ck_pr.h" #elif defined(__ppc__) #include "gcc/ppc/ck_pr.h" -#elif defined(__GNUC__) -#include "gcc/ck_pr.h" -#else +#elif !defined(__GNUC__) #error Your platform is unsupported #endif +#if defined(__GNUC__) +#include "gcc/ck_pr.h" +#endif + +#define CK_PR_FENCE_EMIT(T) \ + CK_CC_INLINE static void \ + ck_pr_fence_##T(void) \ + { \ + ck_pr_fence_strict_##T(); \ + } +#define CK_PR_FENCE_NOOP(T) \ + CK_CC_INLINE static void \ + ck_pr_fence_##T(void) \ + { \ + return; \ + } + +/* + * None of the currently supported platforms allow for data-dependent + * load ordering. + */ +CK_PR_FENCE_NOOP(load_depends) +#define ck_pr_fence_strict_load_depends ck_pr_fence_load_depends + +/* + * In memory models where atomic operations do not have serializing + * effects, atomic read-modify-write operations are modeled as stores. + */ +#if defined(CK_MD_RMO) +/* + * Only stores to the same location have a global + * ordering. + */ +CK_PR_FENCE_EMIT(atomic) +CK_PR_FENCE_EMIT(atomic_atomic) +CK_PR_FENCE_EMIT(atomic_load) +CK_PR_FENCE_EMIT(atomic_store) +CK_PR_FENCE_EMIT(store_atomic) +CK_PR_FENCE_EMIT(load_atomic) +CK_PR_FENCE_EMIT(load_load) +CK_PR_FENCE_EMIT(load_store) +CK_PR_FENCE_EMIT(store_store) +CK_PR_FENCE_EMIT(store_load) +CK_PR_FENCE_EMIT(load) +CK_PR_FENCE_EMIT(store) +CK_PR_FENCE_EMIT(memory) +#elif defined(CK_MD_PSO) +/* + * Anything can be re-ordered with respect to stores. + * Otherwise, loads are executed in-order. + */ +CK_PR_FENCE_EMIT(atomic) +CK_PR_FENCE_EMIT(atomic_atomic) +CK_PR_FENCE_NOOP(atomic_load) +CK_PR_FENCE_EMIT(atomic_store) +CK_PR_FENCE_EMIT(store_atomic) +CK_PR_FENCE_NOOP(load_atomic) +CK_PR_FENCE_NOOP(load_load) +CK_PR_FENCE_EMIT(load_store) +CK_PR_FENCE_EMIT(store_store) +CK_PR_FENCE_EMIT(store_load) +CK_PR_FENCE_NOOP(load) +CK_PR_FENCE_EMIT(store) +CK_PR_FENCE_EMIT(memory) +#elif defined(CK_MD_TSO) +/* + * Only loads are re-ordered and only with respect to + * prior stores. Atomic operations are serializing. 
+ */ +CK_PR_FENCE_NOOP(atomic) +CK_PR_FENCE_NOOP(atomic_atomic) +CK_PR_FENCE_NOOP(atomic_load) +CK_PR_FENCE_NOOP(atomic_store) +CK_PR_FENCE_NOOP(store_atomic) +CK_PR_FENCE_NOOP(load_atomic) +CK_PR_FENCE_NOOP(load_load) +CK_PR_FENCE_NOOP(load_store) +CK_PR_FENCE_NOOP(store_store) +CK_PR_FENCE_EMIT(store_load) +CK_PR_FENCE_NOOP(load) +CK_PR_FENCE_NOOP(store) +CK_PR_FENCE_NOOP(memory) +#endif /* CK_MD_TSO */ + +#undef CK_PR_FENCE_EMIT +#undef CK_PR_FENCE_NOOP + #define CK_PR_BIN(K, S, M, T, P, C) \ CK_CC_INLINE static void \ ck_pr_##K##_##S(M *target, T value) \ diff --git a/include/ck_rwlock.h b/include/ck_rwlock.h index 45593b0..81587ac 100644 --- a/include/ck_rwlock.h +++ b/include/ck_rwlock.h @@ -74,7 +74,8 @@ ck_rwlock_write_trylock(ck_rwlock_t *rw) if (ck_pr_fas_uint(&rw->writer, 1) != 0) return false; - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); + if (ck_pr_load_uint(&rw->n_readers) != 0) { ck_rwlock_write_unlock(rw); return false; @@ -90,7 +91,7 @@ ck_rwlock_write_lock(ck_rwlock_t *rw) while (ck_pr_fas_uint(&rw->writer, 1) != 0) ck_pr_stall(); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); while (ck_pr_load_uint(&rw->n_readers) != 0) ck_pr_stall(); @@ -111,16 +112,15 @@ ck_rwlock_read_trylock(ck_rwlock_t *rw) * Serialize with respect to concurrent write * lock operation. */ - ck_pr_fence_memory(); - if (ck_pr_load_uint(&rw->writer) == 0) - goto leave; + ck_pr_fence_atomic_load(); + + if (ck_pr_load_uint(&rw->writer) == 0) { + ck_pr_fence_load(); + return true; + } + ck_pr_dec_uint(&rw->n_readers); return false; - -leave: - /* Acquire semantics are necessary. */ - ck_pr_fence_load(); - return true; } CK_CC_INLINE static void @@ -137,7 +137,8 @@ ck_rwlock_read_lock(ck_rwlock_t *rw) * Serialize with respect to concurrent write * lock operation. */ - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); + if (ck_pr_load_uint(&rw->writer) == 0) break; ck_pr_dec_uint(&rw->n_readers); @@ -180,7 +181,7 @@ ck_rwlock_recursive_write_lock(ck_rwlock_recursive_t *rw, unsigned int tid) while (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false) ck_pr_stall(); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); while (ck_pr_load_uint(&rw->rw.n_readers) != 0) ck_pr_stall(); @@ -202,7 +203,7 @@ ck_rwlock_recursive_write_trylock(ck_rwlock_recursive_t *rw, unsigned int tid) if (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false) return false; - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); if (ck_pr_load_uint(&rw->rw.n_readers) != 0) { ck_pr_store_uint(&rw->rw.writer, 0); diff --git a/include/ck_spinlock.h b/include/ck_spinlock.h index 6b08789..323de5c 100644 --- a/include/ck_spinlock.h +++ b/include/ck_spinlock.h @@ -142,7 +142,7 @@ ck_spinlock_anderson_lock(struct ck_spinlock_anderson *lock, /* Prepare slot for potential re-use by another thread. 
*/ ck_pr_store_uint(&lock->slots[position].locked, true); - ck_pr_fence_store(); + ck_pr_fence_memory(); *slot = lock->slots + position; return; @@ -194,7 +194,7 @@ ck_spinlock_fas_trylock(struct ck_spinlock_fas *lock) if (value == false) ck_pr_fence_memory(); - return (!value); + return !value; } CK_CC_INLINE static bool @@ -268,7 +268,7 @@ ck_spinlock_cas_trylock(struct ck_spinlock_cas *lock) if (value == false) ck_pr_fence_memory(); - return (!value); + return !value; } CK_CC_INLINE static bool @@ -658,9 +658,9 @@ CK_CC_INLINE static bool ck_spinlock_mcs_trylock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *node) { - ck_pr_store_uint(&node->locked, true); - ck_pr_store_ptr(&node->next, NULL); - ck_pr_fence_store(); + node->locked = true; + node->next = NULL; + ck_pr_fence_store_atomic(); if (ck_pr_cas_ptr(queue, NULL, node) == true) { ck_pr_fence_load(); @@ -686,24 +686,24 @@ ck_spinlock_mcs_lock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *nod * In the case that there is a successor, let them know they must wait * for us to unlock. */ - ck_pr_store_uint(&node->locked, true); - ck_pr_store_ptr(&node->next, NULL); + node->locked = true; + node->next = NULL; + ck_pr_fence_store_atomic(); /* * Swap current tail with current lock request. If the swap operation * returns NULL, it means the queue was empty. If the queue was empty, * then the operation is complete. */ - ck_pr_fence_memory(); previous = ck_pr_fas_ptr(queue, node); - if (previous == NULL) - return; - - /* Let the previous lock holder know that we are waiting on them. */ - ck_pr_store_ptr(&previous->next, node); - while (ck_pr_load_uint(&node->locked) == true) - ck_pr_stall(); + if (previous != NULL) { + /* Let the previous lock holder know that we are waiting on them. */ + ck_pr_store_ptr(&previous->next, node); + while (ck_pr_load_uint(&node->locked) == true) + ck_pr_stall(); + } + ck_pr_fence_load(); return; } @@ -712,6 +712,8 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n { struct ck_spinlock_mcs *next; + ck_pr_fence_memory(); + next = ck_pr_load_ptr(&node->next); if (next == NULL) { /* @@ -721,7 +723,6 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n */ if (ck_pr_load_ptr(queue) == node && ck_pr_cas_ptr(queue, node, NULL) == true) { - ck_pr_fence_memory(); return; } @@ -740,9 +741,7 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n } /* Allow the next lock operation to complete. */ - ck_pr_fence_memory(); ck_pr_store_uint(&next->locked, false); - return; } #endif /* CK_F_SPINLOCK_MCS */ diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index 505153d..c5231bd 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -31,9 +31,21 @@ #error Do not include this file directly, use ck_pr.h #endif +#include + +CK_CC_INLINE static void +ck_pr_barrier(void) +{ + + __asm__ __volatile__("" ::: "memory"); + return; +} + +#ifndef CK_F_PR +#define CK_F_PR + #include #include -#include /* * The following represent supported atomic operations. @@ -93,45 +105,32 @@ ck_pr_stall(void) return; } -/* - * Most target architectures do not require this. - */ -CK_CC_INLINE static void -ck_pr_fence_load_depends(void) -{ - - __sync_synchronize(); - return; -} - /* * Load and store fences are equivalent to full fences in the GCC port. 
*/ #define CK_PR_FENCE(T) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ - { \ - __sync_synchronize(); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ { \ __sync_synchronize(); \ } +CK_PR_FENCE(atomic) +CK_PR_FENCE(atomic_atomic) +CK_PR_FENCE(atomic_load) +CK_PR_FENCE(atomic_store) +CK_PR_FENCE(store_atomic) +CK_PR_FENCE(load_atomic) CK_PR_FENCE(load) +CK_PR_FENCE(load_load) +CK_PR_FENCE(load_store) CK_PR_FENCE(store) +CK_PR_FENCE(store_store) +CK_PR_FENCE(store_load) CK_PR_FENCE(memory) #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - /* * Atomic compare and swap. */ @@ -275,5 +274,5 @@ CK_PR_UNARY_S(8, uint8_t) #undef CK_PR_UNARY_S #undef CK_PR_UNARY - +#endif /* !CK_F_PR */ #endif /* _CK_PR_GCC_H */ diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index e1f88a6..7a7d0df 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -41,6 +41,11 @@ */ #include "ck_f_pr.h" +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + /* * This bounces the hardware thread from low to medium * priority. I am unsure of the benefits of this approach @@ -55,45 +60,29 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ - } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ - -CK_PR_FENCE(load_depends, "") +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__(I ::: "memory"); \ + } + +CK_PR_FENCE(atomic, "lwsync") +CK_PR_FENCE(atomic_atomic, "lwsync") +CK_PR_FENCE(atomic_store, "lwsync") +CK_PR_FENCE(atomic_load, "sync") +CK_PR_FENCE(store_atomic, "lwsync") +CK_PR_FENCE(load_atomic, "lwsync") CK_PR_FENCE(store, "lwsync") +CK_PR_FENCE(store_store, "lwsync") +CK_PR_FENCE(store_load, "sync") CK_PR_FENCE(load, "lwsync") +CK_PR_FENCE(load_load, "lwsync") +CK_PR_FENCE(load_store, "lwsync") CK_PR_FENCE(memory, "sync") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - #define CK_PR_LOAD(S, M, T, C, I) \ CK_CC_INLINE static T \ ck_pr_load_##S(const M *target) \ diff --git a/include/gcc/ppc64/ck_pr.h b/include/gcc/ppc64/ck_pr.h index 62aeb7a..2aa145d 100644 --- a/include/gcc/ppc64/ck_pr.h +++ b/include/gcc/ppc64/ck_pr.h @@ -40,6 +40,11 @@ */ #include "ck_f_pr.h" +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + /* * This bounces the hardware thread from low to medium * priority. 
I am unsure of the benefits of this approach @@ -54,49 +59,33 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ - } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__(I ::: "memory"); \ + } /* * These are derived from: * http://www.ibm.com/developerworks/systems/articles/powerpc.html */ -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(atomic, "lwsync") +CK_PR_FENCE(atomic_atomic, "lwsync") +CK_PR_FENCE(atomic_store, "lwsync") +CK_PR_FENCE(atomic_load, "sync") +CK_PR_FENCE(store_atomic, "lwsync") +CK_PR_FENCE(load_atomic, "lwsync") CK_PR_FENCE(store, "lwsync") +CK_PR_FENCE(store_store, "lwsync") +CK_PR_FENCE(store_load, "sync") CK_PR_FENCE(load, "lwsync") +CK_PR_FENCE(load_load, "lwsync") +CK_PR_FENCE(load_store, "lwsync") CK_PR_FENCE(memory, "sync") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - #define CK_PR_LOAD(S, M, T, C, I) \ CK_CC_INLINE static T \ ck_pr_load_##S(const M *target) \ diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index ba2fc41..076e378 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -40,6 +40,11 @@ */ #include "ck_f_pr.h" +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + /* * Order loads at the least. */ @@ -51,51 +56,33 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) -/* - * If RMO is forced, then do not assume TSO model. - */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__(I ::: "memory"); \ + } + /* - * By default, we will assume TSO model is used on SPARCv9. + * Atomic operations are treated as both load and store + * operations on SPARCv9. 
*/ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ - } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ - -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(atomic_atomic, "membar #StoreStore") +CK_PR_FENCE(atomic, "membar #StoreStore") +CK_PR_FENCE(atomic_store, "membar #StoreStore") +CK_PR_FENCE(atomic_load, "membar #StoreLoad") +CK_PR_FENCE(store_atomic, "membar #StoreStore") +CK_PR_FENCE(load_atomic, "membar #LoadStore") CK_PR_FENCE(store, "membar #StoreStore") +CK_PR_FENCE(store_store, "membar #StoreStore") +CK_PR_FENCE(store_load, "membar #StoreLoad") CK_PR_FENCE(load, "membar #LoadLoad") +CK_PR_FENCE(load_load, "membar #LoadLoad") +CK_PR_FENCE(load_store, "membar #LoadStore") CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - #define CK_PR_LOAD(S, M, T, C, I) \ CK_CC_INLINE static T \ ck_pr_load_##S(const M *target) \ diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index 38a0485..bbed9bf 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -63,52 +63,29 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) #define CK_PR_FENCE(T, I) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ { \ __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -/* - * IA32 has strong memory ordering guarantees, so memory - * fences are enabled if and only if the user specifies that - * that the program will be using non-temporal instructions. - * Otherwise, an optimization barrier is used in order to prevent - * compiler re-ordering of loads and stores across the barrier. - */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ +CK_PR_FENCE(atomic, "sfence") +CK_PR_FENCE(atomic_atomic, "sfence") +CK_PR_FENCE(atomic_store, "sfence") +CK_PR_FENCE(atomic_load, "mfence") +CK_PR_FENCE(store_atomic, "sfence") +CK_PR_FENCE(load_atomic, "mfence") CK_PR_FENCE(load, "lfence") -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(load_load, "lfence") +CK_PR_FENCE(load_store, "mfence") CK_PR_FENCE(store, "sfence") +CK_PR_FENCE(store_store, "sfence") +CK_PR_FENCE(store_load, "mfence") CK_PR_FENCE(memory, "mfence") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - /* * Atomic fetch-and-store operations. 
*/ diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index 84e893b..b0813e4 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -62,52 +62,27 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) #define CK_PR_FENCE(T, I) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ { \ __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -/* - * IA32 has strong memory ordering guarantees, so memory - * fences are enabled if and only if the user specifies that - * that the program will be using non-temporal instructions. - * Otherwise, an optimization barrier is used in order to prevent - * compiler re-ordering of loads and stores across the barrier. - */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ +CK_PR_FENCE(atomic_store, "sfence") +CK_PR_FENCE(atomic_load, "mfence") +CK_PR_FENCE(store_atomic, "sfence") +CK_PR_FENCE(load_atomic, "mfence") CK_PR_FENCE(load, "lfence") -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(load_load, "lfence") +CK_PR_FENCE(load_store, "mfence") CK_PR_FENCE(store, "sfence") +CK_PR_FENCE(store_store, "sfence") +CK_PR_FENCE(store_load, "mfence") CK_PR_FENCE(memory, "mfence") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - /* * Atomic fetch-and-store operations. */ diff --git a/regressions/ck_pr/benchmark/Makefile b/regressions/ck_pr/benchmark/Makefile index f43e792..6b6116e 100644 --- a/regressions/ck_pr/benchmark/Makefile +++ b/regressions/ck_pr/benchmark/Makefile @@ -3,16 +3,16 @@ all: ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 ck_pr_cas_64_2: ck_pr_cas_64_2.c - $(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c + $(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c -lm ck_pr_cas_64: ck_pr_cas_64.c - $(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c + $(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c -lm ck_pr_fas_64: ck_pr_fas_64.c - $(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c + $(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c -lm clean: rm -rf ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 *.dSYM *.exe include ../../../build/regressions.build -CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm +CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE diff --git a/regressions/ck_spinlock/benchmark/Makefile b/regressions/ck_spinlock/benchmark/Makefile index 14bd901..1afeb37 100644 --- a/regressions/ck_spinlock/benchmark/Makefile +++ b/regressions/ck_spinlock/benchmark/Makefile @@ -14,67 +14,67 @@ OBJECTS=ck_ticket.THROUGHPUT ck_ticket.LATENCY \ all: $(OBJECTS) ck_spinlock.THROUGHPUT: ck_spinlock.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c -lm ck_spinlock.LATENCY: ck_spinlock.c - $(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c + $(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c -lm ck_ticket.THROUGHPUT: ck_ticket.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c -lm ck_ticket.LATENCY: ck_ticket.c - $(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c + $(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c 
-lm ck_mcs.THROUGHPUT: ck_mcs.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c -lm ck_mcs.LATENCY: ck_mcs.c - $(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c + $(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c -lm ck_dec.THROUGHPUT: ck_dec.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c -lm ck_dec.LATENCY: ck_dec.c - $(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c + $(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c -lm ck_cas.THROUGHPUT: ck_cas.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c -lm ck_cas.LATENCY: ck_cas.c - $(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c + $(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c -lm ck_fas.THROUGHPUT: ck_fas.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c -lm ck_fas.LATENCY: ck_fas.c - $(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c + $(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c -lm ck_clh.THROUGHPUT: ck_clh.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c -lm ck_clh.LATENCY: ck_clh.c - $(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c + $(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c -lm linux_spinlock.THROUGHPUT: linux_spinlock.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c -lm linux_spinlock.LATENCY: linux_spinlock.c - $(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c + $(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c -lm ck_ticket_pb.THROUGHPUT: ck_ticket_pb.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c -lm ck_ticket_pb.LATENCY: ck_ticket_pb.c - $(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c + $(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c -lm ck_anderson.THROUGHPUT: ck_anderson.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c -lm ck_anderson.LATENCY: ck_anderson.c - $(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c + $(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c -lm clean: rm -rf *.dSYM *.exe $(OBJECTS) include ../../../build/regressions.build -CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm +CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE
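A note on the new generic dispatch in include/ck_pr.h: callers write the fence that names the ordering they need (store_store, load_load, store_load, atomic_load, ...) and CK_PR_FENCE_EMIT/CK_PR_FENCE_NOOP decide, per memory model, whether an instruction is emitted, while the ck_pr_fence_strict_* variants always emit. The producer/consumer sketch below is purely illustrative (message, ready, produce and consume are not CK names); it only shows the intended division of labor under the new scheme.

#include <ck_pr.h>
#include <stdbool.h>

static int message;
static unsigned int ready;

static void
produce(int value)
{

	message = value;

	/* Order the payload store before the flag store. */
	ck_pr_fence_store();
	ck_pr_store_uint(&ready, 1);
	return;
}

static bool
consume(int *out)
{

	if (ck_pr_load_uint(&ready) == 0)
		return false;

	/* Order the flag load before the payload load. */
	ck_pr_fence_load();
	*out = message;
	return true;
}

Under CK_MD_TSO both of these fences compile to nothing; under CK_MD_RMO (for example on Power) they expand to the strict variants, lwsync in this case.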
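The ck_brlock, ck_bytelock, ck_epoch and ck_rwlock hunks all implement the same Dekker-style handshake: each side publishes a store or an atomic read-modify-write and must then observe the other side's flag, so store-to-load (or atomic-to-load, when the preceding operation is an RMW) is the minimum ordering required and the previous full memory fences were stronger than needed. A minimal sketch of the pattern, modeled on the ck_rwlock paths; writer_enter, reader_try_enter, writer and n_readers are illustrative names only.

#include <ck_pr.h>
#include <stdbool.h>

static unsigned int writer;	/* set by the writer via fetch-and-store */
static unsigned int n_readers;	/* incremented by readers */

static void
writer_enter(void)
{

	while (ck_pr_fas_uint(&writer, 1) != 0)
		ck_pr_stall();

	/*
	 * The RMW above must be ordered before the loads of n_readers
	 * below; a full ck_pr_fence_memory() is stronger than needed.
	 */
	ck_pr_fence_atomic_load();

	while (ck_pr_load_uint(&n_readers) != 0)
		ck_pr_stall();
	return;
}

static bool
reader_try_enter(void)
{

	ck_pr_inc_uint(&n_readers);

	/* Order the counter update before the snapshot of the writer flag. */
	ck_pr_fence_atomic_load();

	if (ck_pr_load_uint(&writer) != 0) {
		ck_pr_dec_uint(&n_readers);
		return false;
	}

	/* Acquire semantics for the reader-side critical section. */
	ck_pr_fence_load();
	return true;
}

Under CK_MD_TSO the atomic-to-load fence is a no-op because atomic operations already serialize on x86; on architectures where they do not (CK_MD_RMO builds such as Power), it expands to sync, which is exactly the strength the handshake needs.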
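In the ck_hp_fifo and ck_hp_stack hunks the only constraint is that the hazard-pointer slot becomes globally visible before the protected pointer is re-validated, which is exactly a store-to-load ordering. A sketch of that publication loop under that assumption; protect() and location are illustrative names, not part of the ck_hp API.

#include <ck_hp.h>
#include <ck_pr.h>

static void *
protect(ck_hp_record_t *record, void **location)
{
	void *entry;

	do {
		entry = ck_pr_load_ptr(location);
		ck_hp_set(record, 0, entry);

		/*
		 * The hazard-pointer store must be visible before the
		 * validating re-load; ck_pr_fence_store_load() is the
		 * minimum fence that guarantees this (mfence on x86,
		 * sync on Power, membar #StoreLoad on SPARCv9).
		 */
		ck_pr_fence_store_load();
	} while (entry != ck_pr_load_ptr(location));

	return entry;
}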
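The ck_spinlock_mcs changes take the full fence out of the acquisition fast path: node initialization now only has to be ordered before the fetch-and-store that links the node into the queue (ck_pr_fence_store_atomic), acquire ordering is provided by a single ck_pr_fence_load after the spin, and release ordering moves to the top of unlock. The calling convention is unchanged; a usage sketch follows, in which queue, node and critical_section are illustrative names.

#include <ck_spinlock.h>

static struct ck_spinlock_mcs *queue = NULL;	/* shared queue tail */

static void
critical_section(void)
{
	struct ck_spinlock_mcs node;	/* per-acquisition context */

	ck_spinlock_mcs_lock(&queue, &node);
	/* ... operations protected by the lock ... */
	ck_spinlock_mcs_unlock(&queue, &node);
	return;
}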
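On x86 and x86-64 every relaxed fence other than store-to-load is now a compile-time no-op, which is sound for ordinary write-back memory under TSO; the strict variants remain for the weakly-ordered cases the removed comment alluded to, such as non-temporal stores. A hypothetical x86-64-only sketch; publish_nt and the inline movnti are assumptions about caller code, not anything this patch emits.

#include <ck_pr.h>
#include <stdint.h>

static void
publish_nt(uint64_t *payload, unsigned int *flag)
{
	uint64_t value = 42;

	/* Non-temporal store: bypasses the usual TSO store ordering. */
	__asm__ __volatile__("movnti %1, %0"
	    : "=m" (*payload)
	    : "r" (value)
	    : "memory");

	/*
	 * ck_pr_fence_store() is a no-op under CK_MD_TSO, so the strict
	 * variant is required here to drain the non-temporal store
	 * (sfence) before the flag store becomes visible.
	 */
	ck_pr_fence_strict_store();
	ck_pr_store_uint(flag, 1);
	return;
}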