From 44b769963fc0ce724e770e5fb3f52f38307c684b Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 14:23:27 -0400 Subject: [PATCH 01/22] ck_pr: ck_pr_fence_X_Y interface has been added. ck_pr_fence_{load_load,store_store,load_store,store_load} operations have been added. In addition to this, it is no longer the responsibility of architecture ports to determine when to emit a specific fence. Instead, the underlying port will always emit the necessary instructions to enforce strict ordering. The higher-level include/ck_pr implementation will enforce whether or not a fence is necessary to be emitted according to the memory model specified by ck_md (CK_MD_{TSO,RMO,PSO}). In other words, only ck_pr_fence_strict_* is implemented by the MD-ck_pr port. --- include/ck_pr.h | 61 +++++++++++++++++++++++++++++++++++++ include/gcc/ppc/ck_pr.h | 33 ++++++-------------- include/gcc/ppc64/ck_pr.h | 33 ++++++-------------- include/gcc/sparcv9/ck_pr.h | 39 ++++++------------------ include/gcc/x86/ck_pr.h | 29 +++--------------- include/gcc/x86_64/ck_pr.h | 29 +++--------------- 6 files changed, 99 insertions(+), 125 deletions(-) diff --git a/include/ck_pr.h b/include/ck_pr.h index 35540b3..10839a3 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -49,6 +50,66 @@ #error Your platform is unsupported #endif +#define CK_PR_FENCE_EMIT(T) \ + CK_CC_INLINE static void \ + ck_pr_fence_##T(void) \ + { \ + ck_pr_fence_strict_##T(); \ + } +#define CK_PR_FENCE_NOOP(T) \ + CK_CC_INLINE static void \ + ck_pr_fence_##T(void) \ + { \ + return; \ + } + +/* + * None of the currently supported platforms allow for data-dependent + * load ordering. + */ +CK_PR_FENCE_NOOP(load_depends) + +#if defined(CK_MD_RMO) +/* + * Only stores to the same location have a global + * ordering. + */ +CK_PR_FENCE_EMIT(load_load) +CK_PR_FENCE_EMIT(load_store) +CK_PR_FENCE_EMIT(store_store) +CK_PR_FENCE_EMIT(store_load) +CK_PR_FENCE_EMIT(load) +CK_PR_FENCE_EMIT(store) +CK_PR_FENCE_EMIT(memory) +#elif defined(CK_MD_PSO) +/* + * Anything can be re-ordered with respect to stores. + * Otherwise, loads are executed in-order. + */ +CK_PR_FENCE_NOOP(load_load) +CK_PR_FENCE_EMIT(load_store) +CK_PR_FENCE_EMIT(store_store) +CK_PR_FENCE_EMIT(store_load) +CK_PR_FENCE_NOOP(load) +CK_PR_FENCE_EMIT(store) +CK_PR_FENCE_EMIT(memory) +#elif defined(CK_MD_TSO) +/* + * Only loads are re-ordered and only with respect to + * prior stores. 
+ */ +CK_PR_FENCE_NOOP(load_load) +CK_PR_FENCE_NOOP(load_store) +CK_PR_FENCE_NOOP(store_store) +CK_PR_FENCE_EMIT(store_load) +CK_PR_FENCE_NOOP(load) +CK_PR_FENCE_NOOP(store) +CK_PR_FENCE_NOOP(memory) +#endif /* CK_MD_TSO */ + +#undef CK_PR_FENCE_EMIT +#undef CK_PR_FENCE_NOOP + #define CK_PR_BIN(K, S, M, T, P, C) \ CK_CC_INLINE static void \ ck_pr_##K##_##S(M *target, T value) \ diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index e1f88a6..8b14772 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -55,33 +55,20 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ - } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__(I ::: "memory"); \ + } CK_PR_FENCE(load_depends, "") CK_PR_FENCE(store, "lwsync") +CK_PR_FENCE(store_store, "lwsync") +CK_PR_FENCE(store_load, "sync") CK_PR_FENCE(load, "lwsync") +CK_PR_FENCE(load_load, "lwsync") +CK_PR_FENCE(load_store, "lwsync") CK_PR_FENCE(memory, "sync") #undef CK_PR_FENCE diff --git a/include/gcc/ppc64/ck_pr.h b/include/gcc/ppc64/ck_pr.h index 62aeb7a..aebd2c9 100644 --- a/include/gcc/ppc64/ck_pr.h +++ b/include/gcc/ppc64/ck_pr.h @@ -54,29 +54,12 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ - } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__(I ::: "memory"); \ + } /* * These are derived from: @@ -84,7 +67,11 @@ ck_pr_stall(void) */ CK_PR_FENCE(load_depends, "") CK_PR_FENCE(store, "lwsync") +CK_PR_FENCE(store_store, "lwsync") +CK_PR_FENCE(store_load, "sync") CK_PR_FENCE(load, "lwsync") +CK_PR_FENCE(load_load, "lwsync") +CK_PR_FENCE(load_store, "lwsync") CK_PR_FENCE(memory, "sync") #undef CK_PR_FENCE diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index ba2fc41..fe6991a 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -51,39 +51,20 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) -/* - * If RMO is forced, then do not assume TSO model. - */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -/* - * By default, we will assume TSO model is used on SPARCv9. 
- */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ - } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__(I ::: "memory"); \ + } CK_PR_FENCE(load_depends, "") CK_PR_FENCE(store, "membar #StoreStore") +CK_PR_FENCE(store_store, "membar #StoreStore") +CK_PR_FENCE(store_load, "membar #StoreLoad") CK_PR_FENCE(load, "membar #LoadLoad") +CK_PR_FENCE(load_load, "membar #LoadLoad") +CK_PR_FENCE(load_store, "membar #LoadStore") CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad") #undef CK_PR_FENCE diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index 38a0485..e0b04c9 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -63,40 +63,19 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) #define CK_PR_FENCE(T, I) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ { \ __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -/* - * IA32 has strong memory ordering guarantees, so memory - * fences are enabled if and only if the user specifies that - * that the program will be using non-temporal instructions. - * Otherwise, an optimization barrier is used in order to prevent - * compiler re-ordering of loads and stores across the barrier. - */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ CK_PR_FENCE(load, "lfence") -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(load_load, "lfence") +CK_PR_FENCE(load_store, "mfence") CK_PR_FENCE(store, "sfence") +CK_PR_FENCE(store_store, "sfence") +CK_PR_FENCE(store_load, "mfence") CK_PR_FENCE(memory, "mfence") #undef CK_PR_FENCE diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index 84e893b..004f5e5 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -62,40 +62,19 @@ ck_pr_stall(void) return; } -#if defined(CK_MD_RMO) || defined(CK_MD_PSO) #define CK_PR_FENCE(T, I) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ { \ __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } -#else -/* - * IA32 has strong memory ordering guarantees, so memory - * fences are enabled if and only if the user specifies that - * that the program will be using non-temporal instructions. - * Otherwise, an optimization barrier is used in order to prevent - * compiler re-ordering of loads and stores across the barrier. 
- */ -#define CK_PR_FENCE(T, I) \ - CK_CC_INLINE static void \ - ck_pr_fence_strict_##T(void) \ - { \ - __asm__ __volatile__(I ::: "memory"); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ - { \ - __asm__ __volatile__("" ::: "memory"); \ } -#endif /* !CK_MD_RMO && !CK_MD_PSO */ CK_PR_FENCE(load, "lfence") -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(load_load, "lfence") +CK_PR_FENCE(load_store, "mfence") CK_PR_FENCE(store, "sfence") +CK_PR_FENCE(store_store, "sfence") +CK_PR_FENCE(store_load, "mfence") CK_PR_FENCE(memory, "mfence") #undef CK_PR_FENCE From 137fb4995164dbd64ca9f01720e7d3e7f2f9a569 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 14:39:29 -0400 Subject: [PATCH 02/22] regressions: Fix link order. --- regressions/ck_pr/benchmark/Makefile | 8 ++--- regressions/ck_spinlock/benchmark/Makefile | 42 +++++++++++----------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/regressions/ck_pr/benchmark/Makefile b/regressions/ck_pr/benchmark/Makefile index f43e792..6b6116e 100644 --- a/regressions/ck_pr/benchmark/Makefile +++ b/regressions/ck_pr/benchmark/Makefile @@ -3,16 +3,16 @@ all: ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 ck_pr_cas_64_2: ck_pr_cas_64_2.c - $(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c + $(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c -lm ck_pr_cas_64: ck_pr_cas_64.c - $(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c + $(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c -lm ck_pr_fas_64: ck_pr_fas_64.c - $(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c + $(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c -lm clean: rm -rf ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 *.dSYM *.exe include ../../../build/regressions.build -CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm +CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE diff --git a/regressions/ck_spinlock/benchmark/Makefile b/regressions/ck_spinlock/benchmark/Makefile index 14bd901..1afeb37 100644 --- a/regressions/ck_spinlock/benchmark/Makefile +++ b/regressions/ck_spinlock/benchmark/Makefile @@ -14,67 +14,67 @@ OBJECTS=ck_ticket.THROUGHPUT ck_ticket.LATENCY \ all: $(OBJECTS) ck_spinlock.THROUGHPUT: ck_spinlock.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c -lm ck_spinlock.LATENCY: ck_spinlock.c - $(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c + $(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c -lm ck_ticket.THROUGHPUT: ck_ticket.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c -lm ck_ticket.LATENCY: ck_ticket.c - $(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c + $(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c -lm ck_mcs.THROUGHPUT: ck_mcs.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c -lm ck_mcs.LATENCY: ck_mcs.c - $(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c + $(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c -lm ck_dec.THROUGHPUT: ck_dec.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c -lm ck_dec.LATENCY: ck_dec.c - $(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c + $(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c -lm ck_cas.THROUGHPUT: ck_cas.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c -lm ck_cas.LATENCY: ck_cas.c - 
$(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c + $(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c -lm ck_fas.THROUGHPUT: ck_fas.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c -lm ck_fas.LATENCY: ck_fas.c - $(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c + $(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c -lm ck_clh.THROUGHPUT: ck_clh.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c -lm ck_clh.LATENCY: ck_clh.c - $(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c + $(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c -lm linux_spinlock.THROUGHPUT: linux_spinlock.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c -lm linux_spinlock.LATENCY: linux_spinlock.c - $(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c + $(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c -lm ck_ticket_pb.THROUGHPUT: ck_ticket_pb.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c -lm ck_ticket_pb.LATENCY: ck_ticket_pb.c - $(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c + $(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c -lm ck_anderson.THROUGHPUT: ck_anderson.c - $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c + $(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c -lm ck_anderson.LATENCY: ck_anderson.c - $(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c + $(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c -lm clean: rm -rf *.dSYM *.exe $(OBJECTS) include ../../../build/regressions.build -CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm +CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE From 8311e9fcb42ba723355050fb5b899a4c329c60e0 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 14:41:03 -0400 Subject: [PATCH 03/22] ck_pr: Update GCC port and change ck_pr dependency path. The compiler-specific ck_pr is now unconditionally included. It currently implements things like compiler barriers. --- include/gcc/ck_pr.h | 31 +++++++++++++++++-------------- include/gcc/ppc/ck_pr.h | 5 +++++ include/gcc/ppc64/ck_pr.h | 5 +++++ include/gcc/sparcv9/ck_pr.h | 5 +++++ 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index 505153d..aa92eb4 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -31,9 +31,20 @@ #error Do not include this file directly, use ck_pr.h #endif +#include + +CK_CC_INLINE +static void ck_pr_barrier(void) +{ + + __asm__ __volatile__("" ::: "memory"); + return; +} + +#ifndef CK_F_PR +#define CK_F_PR #include #include -#include /* * The following represent supported atomic operations. @@ -110,28 +121,20 @@ ck_pr_fence_load_depends(void) #define CK_PR_FENCE(T) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ - { \ - __sync_synchronize(); \ - } \ - CK_CC_INLINE static void ck_pr_fence_##T(void) \ { \ __sync_synchronize(); \ } CK_PR_FENCE(load) +CK_PR_FENCE(load_load) +CK_PR_FENCE(load_store) CK_PR_FENCE(store) +CK_PR_FENCE(store_store) +CK_PR_FENCE(store_load) CK_PR_FENCE(memory) #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - /* * Atomic compare and swap. 
*/ @@ -275,5 +278,5 @@ CK_PR_UNARY_S(8, uint8_t) #undef CK_PR_UNARY_S #undef CK_PR_UNARY - +#endif /* !CK_F_PR */ #endif /* _CK_PR_GCC_H */ diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index 8b14772..1c53171 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -41,6 +41,11 @@ */ #include "ck_f_pr.h" +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + /* * This bounces the hardware thread from low to medium * priority. I am unsure of the benefits of this approach diff --git a/include/gcc/ppc64/ck_pr.h b/include/gcc/ppc64/ck_pr.h index aebd2c9..a99e89c 100644 --- a/include/gcc/ppc64/ck_pr.h +++ b/include/gcc/ppc64/ck_pr.h @@ -40,6 +40,11 @@ */ #include "ck_f_pr.h" +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + /* * This bounces the hardware thread from low to medium * priority. I am unsure of the benefits of this approach diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index fe6991a..ac2d243 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -40,6 +40,11 @@ */ #include "ck_f_pr.h" +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + /* * Order loads at the least. */ From cc8c3fb2db2c370adb31ef74a68437257bbef70a Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 14:44:25 -0400 Subject: [PATCH 04/22] ck_pr/gcc: Fix style issue. --- include/gcc/ck_pr.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index aa92eb4..b88b177 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -33,8 +33,8 @@ #include -CK_CC_INLINE -static void ck_pr_barrier(void) +CK_CC_INLINE static void +ck_pr_barrier(void) { __asm__ __volatile__("" ::: "memory"); @@ -43,6 +43,7 @@ static void ck_pr_barrier(void) #ifndef CK_F_PR #define CK_F_PR + #include #include From 5506ad2744cfe99d0e2a76ebfd9a8b6b28ff3d38 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 14:48:52 -0400 Subject: [PATCH 05/22] ck_pr: Move ck_pr_barrier to compiler port. 
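For reference, a minimal sketch (not part of this patch) of what the
compiler-level barrier provides: ck_pr_barrier() expands to an empty asm
statement with a "memory" clobber, so the compiler may not reorder or cache
memory accesses across it, but no fence instruction is emitted. The
variables and the publish() routine below are hypothetical.

#include <ck_pr.h>
#include <stdbool.h>

static int payload;
static bool ready;

void
publish(int value)
{

	payload = value;

	/* The compiler may not sink the store to payload below this point. */
	ck_pr_barrier();

	ready = true;

	/*
	 * A weakly ordered processor may still make these stores visible to
	 * other CPUs out of order; cross-CPU publication wants
	 * ck_pr_fence_store() instead.
	 */
	return;
}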
--- include/ck_pr.h | 8 +++++--- include/gcc/ppc/ck_pr.h | 8 -------- include/gcc/ppc64/ck_pr.h | 8 -------- include/gcc/sparcv9/ck_pr.h | 8 -------- include/gcc/x86/ck_pr.h | 8 -------- include/gcc/x86_64/ck_pr.h | 8 -------- 6 files changed, 5 insertions(+), 43 deletions(-) diff --git a/include/ck_pr.h b/include/ck_pr.h index 10839a3..c3007f2 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -44,12 +44,14 @@ #include "gcc/ppc64/ck_pr.h" #elif defined(__ppc__) #include "gcc/ppc/ck_pr.h" -#elif defined(__GNUC__) -#include "gcc/ck_pr.h" -#else +#elif !defined(__GNUC__) #error Your platform is unsupported #endif +#if defined(__GNUC__) +#include "gcc/ck_pr.h" +#endif + #define CK_PR_FENCE_EMIT(T) \ CK_CC_INLINE static void \ ck_pr_fence_##T(void) \ diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index 1c53171..0b9796f 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -78,14 +78,6 @@ CK_PR_FENCE(memory, "sync") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - #define CK_PR_LOAD(S, M, T, C, I) \ CK_CC_INLINE static T \ ck_pr_load_##S(const M *target) \ diff --git a/include/gcc/ppc64/ck_pr.h b/include/gcc/ppc64/ck_pr.h index a99e89c..0fb688a 100644 --- a/include/gcc/ppc64/ck_pr.h +++ b/include/gcc/ppc64/ck_pr.h @@ -81,14 +81,6 @@ CK_PR_FENCE(memory, "sync") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - #define CK_PR_LOAD(S, M, T, C, I) \ CK_CC_INLINE static T \ ck_pr_load_##S(const M *target) \ diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index ac2d243..b92c751 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -74,14 +74,6 @@ CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - #define CK_PR_LOAD(S, M, T, C, I) \ CK_CC_INLINE static T \ ck_pr_load_##S(const M *target) \ diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index e0b04c9..7c058db 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -80,14 +80,6 @@ CK_PR_FENCE(memory, "mfence") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - /* * Atomic fetch-and-store operations. */ diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index 004f5e5..89b4238 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -79,14 +79,6 @@ CK_PR_FENCE(memory, "mfence") #undef CK_PR_FENCE -CK_CC_INLINE static void -ck_pr_barrier(void) -{ - - __asm__ __volatile__("" ::: "memory"); - return; -} - /* * Atomic fetch-and-store operations. */ From b025722fbd5789840332b30327b73ac71d651443 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 15:44:38 -0400 Subject: [PATCH 06/22] ck_brlock: Migrate to ck_pr_fence_X_Y interface. --- include/ck_brlock.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/ck_brlock.h b/include/ck_brlock.h index 7b1f27d..913a745 100644 --- a/include/ck_brlock.h +++ b/include/ck_brlock.h @@ -193,10 +193,9 @@ ck_brlock_read_lock(struct ck_brlock *br, struct ck_brlock_reader *reader) /* Serialize counter update with respect to writer snapshot. 
*/ ck_pr_fence_memory(); #else - ck_pr_store_uint(&reader->n_readers, 1); - /* Loads can be re-ordered before previous stores, even on TSO. */ - ck_pr_fence_strict_memory(); + ck_pr_store_uint(&reader->n_readers, 1); + ck_pr_fence_store_load(); #endif if (ck_pr_load_uint(&br->writer) == false) @@ -229,10 +228,9 @@ ck_brlock_read_trylock(struct ck_brlock *br, ck_pr_stall(); } - ck_pr_store_uint(&reader->n_readers, 1); - /* Loads are re-ordered with respect to prior stores. */ - ck_pr_fence_strict_memory(); + ck_pr_store_uint(&reader->n_readers, 1); + ck_pr_fence_store_load(); if (ck_pr_load_uint(&br->writer) == false) break; From 01f89ee691ae38162c24c7996d83db9d73811bc7 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 15:47:12 -0400 Subject: [PATCH 07/22] ck_hp_fifo: Migrate to ck_pr_fence_X_Y functions. --- include/ck_hp_fifo.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/ck_hp_fifo.h b/include/ck_hp_fifo.h index 41064f3..8c0d08b 100644 --- a/include/ck_hp_fifo.h +++ b/include/ck_hp_fifo.h @@ -81,7 +81,7 @@ ck_hp_fifo_enqueue_mpmc(ck_hp_record_t *record, for (;;) { tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, tail); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (tail != ck_pr_load_ptr(&fifo->tail)) continue; @@ -112,7 +112,7 @@ ck_hp_fifo_tryenqueue_mpmc(ck_hp_record_t *record, tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, tail); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (tail != ck_pr_load_ptr(&fifo->tail)) return false; @@ -140,13 +140,13 @@ ck_hp_fifo_dequeue_mpmc(ck_hp_record_t *record, ck_pr_fence_load(); tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, head); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) continue; next = ck_pr_load_ptr(&head->next); ck_hp_set(record, 1, next); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) continue; @@ -175,13 +175,13 @@ ck_hp_fifo_trydequeue_mpmc(ck_hp_record_t *record, ck_pr_fence_load(); tail = ck_pr_load_ptr(&fifo->tail); ck_hp_set(record, 0, head); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) return NULL; next = ck_pr_load_ptr(&head->next); ck_hp_set(record, 1, next); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (head != ck_pr_load_ptr(&fifo->head)) return NULL; From fe7e5ac5b1ce077c538b6db0c3fa6d4d1b5d6266 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 15:47:28 -0400 Subject: [PATCH 08/22] ck_hp_stack: Migrate to ck_pr_fence_X_Y functions. 
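The conversion follows the usual hazard-pointer protect-then-validate loop.
As a detached sketch (assuming the record type from ck_hp.h; "struct node"
and "top" are hypothetical stand-ins for the container's internals), the
weaker store-to-load fence is all the validation step requires:

#include <ck_hp.h>
#include <ck_pr.h>
#include <stddef.h>

struct node {
	struct node *next;
};

static struct node *top;

struct node *
protect_top(ck_hp_record_t *record)
{
	struct node *snapshot;

	for (;;) {
		snapshot = ck_pr_load_ptr(&top);
		if (snapshot == NULL)
			return NULL;

		/* Publish the hazard pointer for the candidate node. */
		ck_hp_set(record, 0, snapshot);

		/*
		 * The publication above must be globally visible before top
		 * is re-read; only store-to-load ordering is needed here, so
		 * a full ck_pr_fence_strict_memory() is unnecessary.
		 */
		ck_pr_fence_store_load();

		if (snapshot == ck_pr_load_ptr(&top))
			return snapshot;
	}
}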
--- include/ck_hp_stack.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/ck_hp_stack.h b/include/ck_hp_stack.h index 2a7856c..7ac8821 100644 --- a/include/ck_hp_stack.h +++ b/include/ck_hp_stack.h @@ -62,7 +62,7 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target) return NULL; ck_hp_set(record, 0, entry); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); } while (entry != ck_pr_load_ptr(&target->head)); while (ck_pr_cas_ptr_value(&target->head, entry, entry->next, &entry) == false) { @@ -70,11 +70,11 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target) return NULL; ck_hp_set(record, 0, entry); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); update = ck_pr_load_ptr(&target->head); while (entry != update) { ck_hp_set(record, 0, update); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); entry = update; update = ck_pr_load_ptr(&target->head); if (update == NULL) @@ -95,7 +95,7 @@ ck_hp_stack_trypop_mpmc(ck_hp_record_t *record, struct ck_stack *target, struct return false; ck_hp_set(record, 0, entry); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); if (entry != ck_pr_load_ptr(&target->head)) goto leave; From 83bc7f9f549c6a46f7572467be19d1187216860a Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 15:47:55 -0400 Subject: [PATCH 09/22] ck_epoch: Migrate to ck_pr_fence_X_Y. --- include/ck_epoch.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/ck_epoch.h b/include/ck_epoch.h index 4624bdf..c300a11 100644 --- a/include/ck_epoch.h +++ b/include/ck_epoch.h @@ -97,12 +97,11 @@ ck_epoch_begin(ck_epoch_t *epoch, ck_epoch_record_t *record) /* * It is possible for loads to be re-ordered before the store * is committed into the caller's epoch and active fields. - * Execute a full barrier to serialize stores with respect to - * loads + * For this reason, store to load serialization is necessary. */ ck_pr_store_uint(&record->epoch, g_epoch); ck_pr_store_uint(&record->active, 1); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); return; } From f87e0caf99882f4b6aa6895182147ad3170187f3 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sat, 11 May 2013 16:22:44 -0400 Subject: [PATCH 10/22] ck_bytelock: Migrate to ck_pr_fence_X_Y. --- include/ck_bytelock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck_bytelock.h b/include/ck_bytelock.h index f73adb2..d2f552e 100644 --- a/include/ck_bytelock.h +++ b/include/ck_bytelock.h @@ -93,7 +93,7 @@ ck_bytelock_write_lock(struct ck_bytelock *bytelock, unsigned int slot) ck_pr_store_8(&bytelock->readers[slot - 1], false); /* Wait for slotted readers to drain out. */ - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); for (i = 0; i < sizeof(bytelock->readers) / CK_BYTELOCK_LENGTH; i++) { while (CK_BYTELOCK_LOAD((CK_BYTELOCK_TYPE *)&readers[i]) != false) ck_pr_stall(); @@ -150,7 +150,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot) slot -= 1; for (;;) { ck_pr_store_8(&bytelock->readers[slot], true); - ck_pr_fence_strict_memory(); + ck_pr_fence_store_load(); /* * If there is no owner at this point, our slot has From 8face51e0d8d0ebb4935f57382dadb88d23d35a7 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 16:11:07 -0400 Subject: [PATCH 11/22] ck_spinlock: Acquire semantics for anderson spinlock. 
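The slot hand-off at the end of the Anderson lock path needs full (acquire)
ordering rather than store-only ordering: no access from the critical
section may be performed before the lock is observed held. A detached
sketch of the same requirement, using a simple test-and-set lock instead of
the array lock ("busy" and "shared" are hypothetical):

#include <ck_pr.h>

static unsigned int busy;
static unsigned int shared;

unsigned int
toy_lock_and_read(void)
{
	unsigned int value;

	while (ck_pr_fas_uint(&busy, 1) != 0)
		ck_pr_stall();

	/*
	 * Acquire: on RMO or PSO a store-only fence would still allow the
	 * load of shared to be performed before the lock is visibly held;
	 * the full fence does not.
	 */
	ck_pr_fence_memory();

	value = ck_pr_load_uint(&shared);

	/* Release: order the critical section before dropping the lock. */
	ck_pr_fence_memory();
	ck_pr_store_uint(&busy, 0);

	return value;
}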
--- include/ck_spinlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck_spinlock.h b/include/ck_spinlock.h index 6b08789..416cc1c 100644 --- a/include/ck_spinlock.h +++ b/include/ck_spinlock.h @@ -142,7 +142,7 @@ ck_spinlock_anderson_lock(struct ck_spinlock_anderson *lock, /* Prepare slot for potential re-use by another thread. */ ck_pr_store_uint(&lock->slots[position].locked, true); - ck_pr_fence_store(); + ck_pr_fence_memory(); *slot = lock->slots + position; return; From d1dd6611acbf8d159961a447ee38b4f8355b0191 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 16:37:18 -0400 Subject: [PATCH 12/22] ck_pr: Add ck_pr_fence_atomic interface. These operations serialize atomic-RMW operations with respect to each other, loads and stores. In addition to this, the load_depends implementations have been removed. --- include/ck_pr.h | 20 +++++++++++++++++++- include/gcc/ck_pr.h | 6 ++++++ include/gcc/ppc/ck_pr.h | 5 ++++- include/gcc/ppc64/ck_pr.h | 5 ++++- include/gcc/sparcv9/ck_pr.h | 9 ++++++++- include/gcc/x86/ck_pr.h | 4 ++++ include/gcc/x86_64/ck_pr.h | 4 ++++ 7 files changed, 49 insertions(+), 4 deletions(-) diff --git a/include/ck_pr.h b/include/ck_pr.h index c3007f2..43a0b1c 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -76,6 +76,12 @@ CK_PR_FENCE_NOOP(load_depends) * Only stores to the same location have a global * ordering. */ +CK_PR_FENCE_EMIT(atomic) +CK_PR_FENCE_EMIT(atomic_atomic) +CK_PR_FENCE_EMIT(atomic_load) +CK_PR_FENCE_EMIT(atomic_store) +CK_PR_FENCE_EMIT(store_atomic) +CK_PR_FENCE_EMIT(load_atomic) CK_PR_FENCE_EMIT(load_load) CK_PR_FENCE_EMIT(load_store) CK_PR_FENCE_EMIT(store_store) @@ -88,6 +94,12 @@ CK_PR_FENCE_EMIT(memory) * Anything can be re-ordered with respect to stores. * Otherwise, loads are executed in-order. */ +CK_PR_FENCE_EMIT(atomic) +CK_PR_FENCE_EMIT(atomic_atomic) +CK_PR_FENCE_NOOP(atomic_load) +CK_PR_FENCE_EMIT(atomic_store) +CK_PR_FENCE_EMIT(store_atomic) +CK_PR_FENCE_NOOP(load_atomic) CK_PR_FENCE_NOOP(load_load) CK_PR_FENCE_EMIT(load_store) CK_PR_FENCE_EMIT(store_store) @@ -98,8 +110,14 @@ CK_PR_FENCE_EMIT(memory) #elif defined(CK_MD_TSO) /* * Only loads are re-ordered and only with respect to - * prior stores. + * prior stores. Atomic operations are serializing. 
*/ +CK_PR_FENCE_NOOP(atomic) +CK_PR_FENCE_NOOP(atomic_atomic) +CK_PR_FENCE_NOOP(atomic_load) +CK_PR_FENCE_NOOP(atomic_store) +CK_PR_FENCE_NOOP(store_atomic) +CK_PR_FENCE_NOOP(load_atomic) CK_PR_FENCE_NOOP(load_load) CK_PR_FENCE_NOOP(load_store) CK_PR_FENCE_NOOP(store_store) diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index b88b177..196d267 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -126,6 +126,12 @@ ck_pr_fence_load_depends(void) __sync_synchronize(); \ } +CK_PR_FENCE(atomic) +CK_PR_FENCE(atomic_atomic) +CK_PR_FENCE(atomic_load) +CK_PR_FENCE(atomic_store) +CK_PR_FENCE(store_atomic) +CK_PR_FENCE(load_atomic) CK_PR_FENCE(load) CK_PR_FENCE(load_load) CK_PR_FENCE(load_store) diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index 0b9796f..c82e217 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -67,7 +67,10 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(atomic_store, "lwsync") +CK_PR_FENCE(atomic_load, "sync") +CK_PR_FENCE(store_atomic, "lwsync") +CK_PR_FENCE(load_atomic, "lwsync") CK_PR_FENCE(store, "lwsync") CK_PR_FENCE(store_store, "lwsync") CK_PR_FENCE(store_load, "sync") diff --git a/include/gcc/ppc64/ck_pr.h b/include/gcc/ppc64/ck_pr.h index 0fb688a..457efda 100644 --- a/include/gcc/ppc64/ck_pr.h +++ b/include/gcc/ppc64/ck_pr.h @@ -70,7 +70,10 @@ ck_pr_stall(void) * These are derived from: * http://www.ibm.com/developerworks/systems/articles/powerpc.html */ -CK_PR_FENCE(load_depends, "") +CK_PR_FENCE(atomic_store, "lwsync") +CK_PR_FENCE(atomic_load, "sync") +CK_PR_FENCE(store_atomic, "lwsync") +CK_PR_FENCE(load_atomic, "lwsync") CK_PR_FENCE(store, "lwsync") CK_PR_FENCE(store_store, "lwsync") CK_PR_FENCE(store_load, "sync") diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index b92c751..29b9f9c 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -63,7 +63,14 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } -CK_PR_FENCE(load_depends, "") +/* + * Atomic operations are treated as both load and store + * operations on SPARCv9. 
+ */ +CK_PR_FENCE(atomic_store, "membar #StoreStore") +CK_PR_FENCE(atomic_load, "membar #StoreLoad") +CK_PR_FENCE(store_atomic, "membar #StoreStore") +CK_PR_FENCE(load_atomic, "membar #LoadStore") CK_PR_FENCE(store, "membar #StoreStore") CK_PR_FENCE(store_store, "membar #StoreStore") CK_PR_FENCE(store_load, "membar #StoreLoad") diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index 7c058db..eed49ba 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -70,6 +70,10 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } +CK_PR_FENCE(atomic_store, "sfence") +CK_PR_FENCE(atomic_load, "mfence") +CK_PR_FENCE(store_atomic, "sfence") +CK_PR_FENCE(load_atomic, "mfence") CK_PR_FENCE(load, "lfence") CK_PR_FENCE(load_load, "lfence") CK_PR_FENCE(load_store, "mfence") diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index 89b4238..b0813e4 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -69,6 +69,10 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } +CK_PR_FENCE(atomic_store, "sfence") +CK_PR_FENCE(atomic_load, "mfence") +CK_PR_FENCE(store_atomic, "sfence") +CK_PR_FENCE(load_atomic, "mfence") CK_PR_FENCE(load, "lfence") CK_PR_FENCE(load_load, "lfence") CK_PR_FENCE(load_store, "mfence") From a4220f2377b9c1a4d6a5d32f7878c7bd909d18ab Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 16:40:02 -0400 Subject: [PATCH 13/22] ck_pr: Remove ck_pr_fence_load_depends from GCC port. --- include/ck_pr.h | 2 ++ include/gcc/ck_pr.h | 11 ----------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/ck_pr.h b/include/ck_pr.h index 43a0b1c..2577714 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -70,6 +70,8 @@ * load ordering. */ CK_PR_FENCE_NOOP(load_depends) +#define ck_pr_fence_strict_load_depends ck_pr_fence_load_depends + #if defined(CK_MD_RMO) /* diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index 196d267..c5231bd 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -105,17 +105,6 @@ ck_pr_stall(void) return; } -/* - * Most target architectures do not require this. - */ -CK_CC_INLINE static void -ck_pr_fence_load_depends(void) -{ - - __sync_synchronize(); - return; -} - /* * Load and store fences are equivalent to full fences in the GCC port. */ From ffd22e57b80dbf486bb97b0a2331d97a68e7b984 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 16:57:50 -0400 Subject: [PATCH 14/22] ck_bytelock: Use ck_pr_fence_atomic. --- include/ck_bytelock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck_bytelock.h b/include/ck_bytelock.h index d2f552e..9d42393 100644 --- a/include/ck_bytelock.h +++ b/include/ck_bytelock.h @@ -134,7 +134,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot) if (slot > sizeof bytelock->readers) { for (;;) { ck_pr_inc_uint(&bytelock->n_readers); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); if (ck_pr_load_uint(&bytelock->owner) == 0) break; ck_pr_dec_uint(&bytelock->n_readers); From 3f06a4e23afbe85973f7fbac174c9a1d212ba336 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 16:58:48 -0400 Subject: [PATCH 15/22] ck_rwlock: Use ck_pr_fence_atomic. 
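The write-side paths pair an atomic fetch-and-store or compare-and-swap of
the writer field with a subsequent load of n_readers, which is exactly the
ordering ck_pr_fence_atomic_load() names; under TSO it costs nothing since
read-modify-write operations already serialize. A detached sketch of the
same shape ("writer" and "n_readers" below are stand-alone toys, not the
rwlock fields):

#include <ck_pr.h>
#include <stdbool.h>

static unsigned int writer;
static unsigned int n_readers;

bool
toy_write_trylock(void)
{

	if (ck_pr_fas_uint(&writer, 1) != 0)
		return false;

	/* Order the atomic acquisition of writer before the load of n_readers. */
	ck_pr_fence_atomic_load();

	if (ck_pr_load_uint(&n_readers) != 0) {
		ck_pr_store_uint(&writer, 0);
		return false;
	}

	return true;
}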
--- include/ck_rwlock.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/ck_rwlock.h b/include/ck_rwlock.h index 45593b0..81587ac 100644 --- a/include/ck_rwlock.h +++ b/include/ck_rwlock.h @@ -74,7 +74,8 @@ ck_rwlock_write_trylock(ck_rwlock_t *rw) if (ck_pr_fas_uint(&rw->writer, 1) != 0) return false; - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); + if (ck_pr_load_uint(&rw->n_readers) != 0) { ck_rwlock_write_unlock(rw); return false; @@ -90,7 +91,7 @@ ck_rwlock_write_lock(ck_rwlock_t *rw) while (ck_pr_fas_uint(&rw->writer, 1) != 0) ck_pr_stall(); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); while (ck_pr_load_uint(&rw->n_readers) != 0) ck_pr_stall(); @@ -111,16 +112,15 @@ ck_rwlock_read_trylock(ck_rwlock_t *rw) * Serialize with respect to concurrent write * lock operation. */ - ck_pr_fence_memory(); - if (ck_pr_load_uint(&rw->writer) == 0) - goto leave; + ck_pr_fence_atomic_load(); + + if (ck_pr_load_uint(&rw->writer) == 0) { + ck_pr_fence_load(); + return true; + } + ck_pr_dec_uint(&rw->n_readers); return false; - -leave: - /* Acquire semantics are necessary. */ - ck_pr_fence_load(); - return true; } CK_CC_INLINE static void @@ -137,7 +137,8 @@ ck_rwlock_read_lock(ck_rwlock_t *rw) * Serialize with respect to concurrent write * lock operation. */ - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); + if (ck_pr_load_uint(&rw->writer) == 0) break; ck_pr_dec_uint(&rw->n_readers); @@ -180,7 +181,7 @@ ck_rwlock_recursive_write_lock(ck_rwlock_recursive_t *rw, unsigned int tid) while (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false) ck_pr_stall(); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); while (ck_pr_load_uint(&rw->rw.n_readers) != 0) ck_pr_stall(); @@ -202,7 +203,7 @@ ck_rwlock_recursive_write_trylock(ck_rwlock_recursive_t *rw, unsigned int tid) if (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false) return false; - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); if (ck_pr_load_uint(&rw->rw.n_readers) != 0) { ck_pr_store_uint(&rw->rw.writer, 0); From 8320a4a6f43e05a6d851647cc088f8e2afadd91c Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 16:59:36 -0400 Subject: [PATCH 16/22] ck_pr: Comment elaboration on ck_pr_fence_atomic semantics. More specifically, note that in memory models where atomic operations do not have serializing effects that atomic read-modify-write operations are modeled as store operations. --- include/ck_pr.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/ck_pr.h b/include/ck_pr.h index 2577714..e7c98f7 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -72,7 +72,10 @@ CK_PR_FENCE_NOOP(load_depends) #define ck_pr_fence_strict_load_depends ck_pr_fence_load_depends - +/* + * In memory models where atomic operations do not have serializing + * effects, atomic read-modify-write operations are modeled as stores. + */ #if defined(CK_MD_RMO) /* * Only stores to the same location have a global From 65f24e88609d66ab0cc3c4c635f2466ca30596db Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 17:01:24 -0400 Subject: [PATCH 17/22] git: Add ck_stailq regression to gitignore. 
--- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8ce486d..9cc33eb 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ regressions/ck_brlock/benchmark/throughput regressions/ck_rwlock/benchmark/throughput regressions/ck_queue/validate/ck_list regressions/ck_queue/validate/ck_slist +regressions/ck_queue/validate/ck_stailq regressions/ck_cohort/validate/validate regressions/ck_cohort/benchmark/ck_cohort.LATENCY regressions/ck_cohort/benchmark/ck_cohort.THROUGHPUT From 214d7aed66b0eee7dc7061440215f27b59504259 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Sun, 12 May 2013 17:05:27 -0400 Subject: [PATCH 18/22] ck_pr: Implement ck_pr_fence_atomic in MD ck_pr. --- include/gcc/ppc/ck_pr.h | 2 ++ include/gcc/ppc64/ck_pr.h | 2 ++ include/gcc/sparcv9/ck_pr.h | 2 ++ include/gcc/x86/ck_pr.h | 2 ++ 4 files changed, 8 insertions(+) diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index c82e217..7a7d0df 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -67,6 +67,8 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } +CK_PR_FENCE(atomic, "lwsync") +CK_PR_FENCE(atomic_atomic, "lwsync") CK_PR_FENCE(atomic_store, "lwsync") CK_PR_FENCE(atomic_load, "sync") CK_PR_FENCE(store_atomic, "lwsync") diff --git a/include/gcc/ppc64/ck_pr.h b/include/gcc/ppc64/ck_pr.h index 457efda..2aa145d 100644 --- a/include/gcc/ppc64/ck_pr.h +++ b/include/gcc/ppc64/ck_pr.h @@ -70,6 +70,8 @@ ck_pr_stall(void) * These are derived from: * http://www.ibm.com/developerworks/systems/articles/powerpc.html */ +CK_PR_FENCE(atomic, "lwsync") +CK_PR_FENCE(atomic_atomic, "lwsync") CK_PR_FENCE(atomic_store, "lwsync") CK_PR_FENCE(atomic_load, "sync") CK_PR_FENCE(store_atomic, "lwsync") diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index 29b9f9c..076e378 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -67,6 +67,8 @@ ck_pr_stall(void) * Atomic operations are treated as both load and store * operations on SPARCv9. */ +CK_PR_FENCE(atomic_atomic, "membar #StoreStore") +CK_PR_FENCE(atomic, "membar #StoreStore") CK_PR_FENCE(atomic_store, "membar #StoreStore") CK_PR_FENCE(atomic_load, "membar #StoreLoad") CK_PR_FENCE(store_atomic, "membar #StoreStore") diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index eed49ba..bbed9bf 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -70,6 +70,8 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } +CK_PR_FENCE(atomic, "sfence") +CK_PR_FENCE(atomic_atomic, "sfence") CK_PR_FENCE(atomic_store, "sfence") CK_PR_FENCE(atomic_load, "mfence") CK_PR_FENCE(store_atomic, "sfence") From 08d13deaf47eb1f4ac7d36b46944afca5ce435e4 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Mon, 13 May 2013 14:36:01 -0400 Subject: [PATCH 19/22] ck_brlock: Migrate to ck_pr_fence_X_Y. --- include/ck_brlock.h | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/include/ck_brlock.h b/include/ck_brlock.h index 913a745..4246c7d 100644 --- a/include/ck_brlock.h +++ b/include/ck_brlock.h @@ -83,7 +83,7 @@ ck_brlock_write_lock(struct ck_brlock *br) while (ck_pr_fas_uint(&br->writer, true) == true) ck_pr_stall(); - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); /* The reader list is protected under the writer br. 
*/ for (cursor = br->readers; cursor != NULL; cursor = cursor->next) { @@ -121,7 +121,7 @@ ck_brlock_write_trylock(struct ck_brlock *br, unsigned int factor) * We do not require a strict fence here as atomic RMW operations * are serializing. */ - ck_pr_fence_memory(); + ck_pr_fence_atomic_load(); for (cursor = br->readers; cursor != NULL; cursor = cursor->next) { while (ck_pr_load_uint(&cursor->n_readers) != 0) { @@ -190,11 +190,18 @@ ck_brlock_read_lock(struct ck_brlock *br, struct ck_brlock_reader *reader) #if defined(__x86__) || defined(__x86_64__) ck_pr_fas_uint(&reader->n_readers, 1); - /* Serialize counter update with respect to writer snapshot. */ - ck_pr_fence_memory(); + /* + * Serialize reader counter update with respect to load of + * writer. + */ + ck_pr_fence_atomic_load(); #else - /* Loads can be re-ordered before previous stores, even on TSO. */ ck_pr_store_uint(&reader->n_readers, 1); + + /* + * Serialize reader counter update with respect to load of + * writer. + */ ck_pr_fence_store_load(); #endif @@ -228,9 +235,23 @@ ck_brlock_read_trylock(struct ck_brlock *br, ck_pr_stall(); } - /* Loads are re-ordered with respect to prior stores. */ +#if defined(__x86__) || defined(__x86_64__) + ck_pr_fas_uint(&reader->n_readers, 1); + + /* + * Serialize reader counter update with respect to load of + * writer. + */ + ck_pr_fence_atomic_load(); +#else ck_pr_store_uint(&reader->n_readers, 1); + + /* + * Serialize reader counter update with respect to load of + * writer. + */ ck_pr_fence_store_load(); +#endif if (ck_pr_load_uint(&br->writer) == false) break; From 8540821f3faf5db0056d2c7808ee8ae42605ac18 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Mon, 13 May 2013 15:46:51 -0400 Subject: [PATCH 20/22] ck_spinlock: Minor style changes to return statement. --- include/ck_spinlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck_spinlock.h b/include/ck_spinlock.h index 416cc1c..6900c09 100644 --- a/include/ck_spinlock.h +++ b/include/ck_spinlock.h @@ -194,7 +194,7 @@ ck_spinlock_fas_trylock(struct ck_spinlock_fas *lock) if (value == false) ck_pr_fence_memory(); - return (!value); + return !value; } CK_CC_INLINE static bool @@ -268,7 +268,7 @@ ck_spinlock_cas_trylock(struct ck_spinlock_cas *lock) if (value == false) ck_pr_fence_memory(); - return (!value); + return !value; } CK_CC_INLINE static bool From b43832c384ffb8555e461ca5a8ea27d1b00417f4 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Mon, 13 May 2013 16:24:22 -0400 Subject: [PATCH 21/22] ck_spinlock: Migrate MCS to lighter-weight fast path. --- include/ck_spinlock.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/include/ck_spinlock.h b/include/ck_spinlock.h index 6900c09..323de5c 100644 --- a/include/ck_spinlock.h +++ b/include/ck_spinlock.h @@ -658,9 +658,9 @@ CK_CC_INLINE static bool ck_spinlock_mcs_trylock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *node) { - ck_pr_store_uint(&node->locked, true); - ck_pr_store_ptr(&node->next, NULL); - ck_pr_fence_store(); + node->locked = true; + node->next = NULL; + ck_pr_fence_store_atomic(); if (ck_pr_cas_ptr(queue, NULL, node) == true) { ck_pr_fence_load(); @@ -686,24 +686,24 @@ ck_spinlock_mcs_lock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *nod * In the case that there is a successor, let them know they must wait * for us to unlock. 
*/ - ck_pr_store_uint(&node->locked, true); - ck_pr_store_ptr(&node->next, NULL); + node->locked = true; + node->next = NULL; + ck_pr_fence_store_atomic(); /* * Swap current tail with current lock request. If the swap operation * returns NULL, it means the queue was empty. If the queue was empty, * then the operation is complete. */ - ck_pr_fence_memory(); previous = ck_pr_fas_ptr(queue, node); - if (previous == NULL) - return; - - /* Let the previous lock holder know that we are waiting on them. */ - ck_pr_store_ptr(&previous->next, node); - while (ck_pr_load_uint(&node->locked) == true) - ck_pr_stall(); + if (previous != NULL) { + /* Let the previous lock holder know that we are waiting on them. */ + ck_pr_store_ptr(&previous->next, node); + while (ck_pr_load_uint(&node->locked) == true) + ck_pr_stall(); + } + ck_pr_fence_load(); return; } @@ -712,6 +712,8 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n { struct ck_spinlock_mcs *next; + ck_pr_fence_memory(); + next = ck_pr_load_ptr(&node->next); if (next == NULL) { /* @@ -721,7 +723,6 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n */ if (ck_pr_load_ptr(queue) == node && ck_pr_cas_ptr(queue, node, NULL) == true) { - ck_pr_fence_memory(); return; } @@ -740,9 +741,7 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n } /* Allow the next lock operation to complete. */ - ck_pr_fence_memory(); ck_pr_store_uint(&next->locked, false); - return; } #endif /* CK_F_SPINLOCK_MCS */ From 2ba3f5937447133391199b3ff1bbc0cffde001c0 Mon Sep 17 00:00:00 2001 From: Samy Al Bahra Date: Mon, 13 May 2013 16:24:22 -0400 Subject: [PATCH 22/22] ck_spinlock: Migrate MCS to ck_pr_fence_X_Y. This includes fixing acquire semantics on mcs_lock fast path. This represents an additional fence on the fast path for acquire semantics post-acquisition. --- include/ck_spinlock.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/include/ck_spinlock.h b/include/ck_spinlock.h index 6900c09..323de5c 100644 --- a/include/ck_spinlock.h +++ b/include/ck_spinlock.h @@ -658,9 +658,9 @@ CK_CC_INLINE static bool ck_spinlock_mcs_trylock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *node) { - ck_pr_store_uint(&node->locked, true); - ck_pr_store_ptr(&node->next, NULL); - ck_pr_fence_store(); + node->locked = true; + node->next = NULL; + ck_pr_fence_store_atomic(); if (ck_pr_cas_ptr(queue, NULL, node) == true) { ck_pr_fence_load(); @@ -686,24 +686,24 @@ ck_spinlock_mcs_lock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *nod * In the case that there is a successor, let them know they must wait * for us to unlock. */ - ck_pr_store_uint(&node->locked, true); - ck_pr_store_ptr(&node->next, NULL); + node->locked = true; + node->next = NULL; + ck_pr_fence_store_atomic(); /* * Swap current tail with current lock request. If the swap operation * returns NULL, it means the queue was empty. If the queue was empty, * then the operation is complete. */ - ck_pr_fence_memory(); previous = ck_pr_fas_ptr(queue, node); - if (previous == NULL) - return; - - /* Let the previous lock holder know that we are waiting on them. */ - ck_pr_store_ptr(&previous->next, node); - while (ck_pr_load_uint(&node->locked) == true) - ck_pr_stall(); + if (previous != NULL) { + /* Let the previous lock holder know that we are waiting on them. 
*/ + ck_pr_store_ptr(&previous->next, node); + while (ck_pr_load_uint(&node->locked) == true) + ck_pr_stall(); + } + ck_pr_fence_load(); return; } @@ -712,6 +712,8 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n { struct ck_spinlock_mcs *next; + ck_pr_fence_memory(); + next = ck_pr_load_ptr(&node->next); if (next == NULL) { /* @@ -721,7 +723,6 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n */ if (ck_pr_load_ptr(queue) == node && ck_pr_cas_ptr(queue, node, NULL) == true) { - ck_pr_fence_memory(); return; } @@ -740,9 +741,7 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n } /* Allow the next lock operation to complete. */ - ck_pr_fence_memory(); ck_pr_store_uint(&next->locked, false); - return; } #endif /* CK_F_SPINLOCK_MCS */
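
For completeness, a sketch of how a caller drives the MCS interface touched
by the last two patches: the queue tail is a plain pointer that starts out
NULL, and each thread passes its own node, which must remain valid until the
corresponding unlock returns. The counter and the increment() routine below
are hypothetical.

#include <ck_spinlock.h>

static struct ck_spinlock_mcs *queue;	/* NULL means the lock is free. */
static unsigned long counter;

void
increment(void)
{
	struct ck_spinlock_mcs node;

	ck_spinlock_mcs_lock(&queue, &node);
	counter++;
	ck_spinlock_mcs_unlock(&queue, &node);
	return;
}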