diff --git a/include/ck_pr.h b/include/ck_pr.h
index c66f95b..ee886f7 100644
--- a/include/ck_pr.h
+++ b/include/ck_pr.h
@@ -35,8 +35,6 @@
 #if defined(__x86_64__)
 #include "gcc/x86_64/ck_pr.h"
-#elif defined(__x86__)
-#include "gcc/x86/ck_pr.h"
 #elif defined(__sparcv9__)
 #include "gcc/sparcv9/ck_pr.h"
 #elif defined(__GNUC__)
diff --git a/include/gcc/x86/ck_f_pr.h b/include/gcc/x86/ck_f_pr.h
index 0f0ce4b..8894c74 100644
--- a/include/gcc/x86/ck_f_pr.h
+++ b/include/gcc/x86/ck_f_pr.h
@@ -37,6 +37,8 @@
 #define CK_F_PR_CAS_32_2_VALUE
 #define CK_F_PR_CAS_32_VALUE
 #define CK_F_PR_CAS_64
+#define CK_F_PR_CAS_64_1
+#define CK_F_PR_CAS_64_1_VALUE
 #define CK_F_PR_CAS_64_VALUE
 #define CK_F_PR_CAS_8
 #define CK_F_PR_CAS_8_8
@@ -86,6 +88,14 @@
 #define CK_F_PR_FAS_INT
 #define CK_F_PR_FAS_PTR
 #define CK_F_PR_FAS_UINT
+#define CK_F_PR_FENCE_LOAD
+#define CK_F_PR_FENCE_LOAD_DEPENDS
+#define CK_F_PR_FENCE_MEMORY
+#define CK_F_PR_FENCE_STORE
+#define CK_F_PR_FENCE_STRICT_LOAD
+#define CK_F_PR_FENCE_STRICT_LOAD_DEPENDS
+#define CK_F_PR_FENCE_STRICT_MEMORY
+#define CK_F_PR_FENCE_STRICT_STORE
 #define CK_F_PR_INC_16
 #define CK_F_PR_INC_16_ZERO
 #define CK_F_PR_INC_32
@@ -142,9 +152,9 @@
 #define CK_F_PR_OR_INT
 #define CK_F_PR_OR_PTR
 #define CK_F_PR_OR_UINT
+#define CK_F_PR_STALL
 #define CK_F_PR_STORE_16
 #define CK_F_PR_STORE_32
-#define CK_F_PR_STORE_64
 #define CK_F_PR_STORE_8
 #define CK_F_PR_STORE_CHAR
 #define CK_F_PR_STORE_INT
diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h
index 2bb0dd7..eb8df18 100644
--- a/include/gcc/x86/ck_pr.h
+++ b/include/gcc/x86/ck_pr.h
@@ -52,6 +52,42 @@
 #define CK_PR_LOCK_PREFIX "lock "
 #endif
 
+/*
+ * Prevent speculative execution in busy-wait loops (P4 <=)
+ * or "predefined delay".
+ */
+CK_CC_INLINE static void
+ck_pr_stall(void)
+{
+	__asm__ __volatile__("pause" ::: "memory");
+	return;
+}
+
+/*
+ * IA32 has strong memory ordering guarantees, so memory
+ * fences are enabled if and only if the user specifies
+ * that the program will be using non-temporal instructions.
+ * Otherwise, an optimization barrier is used in order to prevent
+ * compiler re-ordering of loads and stores across the barrier.
+ */
+#define CK_PR_FENCE(T, I)				\
+	CK_CC_INLINE static void			\
+	ck_pr_fence_strict_##T(void)			\
+	{						\
+		__asm__ __volatile__(I ::: "memory");	\
+	}						\
+	CK_CC_INLINE static void ck_pr_fence_##T(void)	\
+	{						\
+		__asm__ __volatile__("" ::: "memory");	\
+	}
+
+CK_PR_FENCE(load, "lfence")
+CK_PR_FENCE(load_depends, "")
+CK_PR_FENCE(store, "sfence")
+CK_PR_FENCE(memory, "mfence")
+
+#undef CK_PR_FENCE
+
 /*
  * Atomic fetch-and-store operations.
  */
@@ -517,9 +553,12 @@ ck_pr_cas_##S##_##W##_value(T *t, T c[W], T s[W], T *v)	\
 CK_PR_CAS_V(char, 8, char)
 CK_PR_CAS_V(int, 2, int)
 CK_PR_CAS_V(uint, 2, unsigned int)
+CK_PR_CAS_V(64, 1, uint64_t)
 CK_PR_CAS_V(16, 4, uint16_t)
 CK_PR_CAS_V(8, 8, uint8_t)
 
+#define ck_pr_cas_64_value(A, B, C, D) ck_pr_cas_64_1_value((A), &(B), &(C), (D))
+
 #undef CK_PR_CAS_V
 
 /*
diff --git a/regressions/common.h b/regressions/common.h
index 669fe30..d316580 100644
--- a/regressions/common.h
+++ b/regressions/common.h
@@ -78,7 +78,7 @@ aff_iterate(struct affinity *acb CK_CC_UNUSED)
 CK_CC_INLINE static uint64_t
 rdtsc(void)
 {
-#if defined(__x86__) || defined(__x86_64__)
+#if defined(__x86_64__)
 	uint32_t eax = 0, edx;
 
 	__asm__ __volatile__("cpuid;"
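
For reviewers, here is roughly how the interface added above is meant to be used. This is a minimal sketch, not part of the patch: the producer/consumer pair, the counter, and all variable names are hypothetical, and it assumes ck_pr_load_uint() from the existing ck_pr interface alongside the new ck_pr_stall(), ck_pr_fence_load(), and the cmpxchg8b-backed ck_pr_cas_64_value().

/*
 * Minimal usage sketch (not part of the patch). The producer/consumer
 * pair and the counter are hypothetical; only the ck_pr_* calls come
 * from the interface above.
 */
#include <stdbool.h>
#include <stdint.h>

#include <ck_pr.h>

static unsigned int ready;	/* published by a producer thread */
static uint64_t counter;	/* updated through cmpxchg8b-backed CAS */

static void
consumer_wait(void)
{

	/* Spin with "pause" so P4-era parts do not speculate
	 * through the busy-wait loop. */
	while (ck_pr_load_uint(&ready) == 0)
		ck_pr_stall();

	/* Compiler barrier only; IA32 already orders ordinary loads. */
	ck_pr_fence_load();
	return;
}

static uint64_t
counter_increment(void)
{
	uint64_t expected = 0, desired, actual;

	for (;;) {
		desired = expected + 1;

		/* The _value variant stores the witnessed value of
		 * counter into actual, so a failed CAS retries from
		 * the freshly observed value. */
		if (ck_pr_cas_64_value(&counter, expected, desired, &actual) == true)
			break;

		expected = actual;
		ck_pr_stall();
	}

	return desired;
}

Note that a plain 64-bit load or store is not a single atomic access on x86-32, which is why CK_F_PR_STORE_64 is withdrawn above and 64-bit updates go through the CAS path instead.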