From 4d2ccfe497d9469fac05c54fc77e7c607f5d3ea9 Mon Sep 17 00:00:00 2001
From: Samy Al Bahra
Date: Thu, 11 Jul 2013 20:59:42 -0400
Subject: [PATCH] ck_rwlock: Add basic RTM interface to rwlock.

It is possible this will be moved to a self-contained file.
For a majority of architectures, RTM is an unnecessary
implementation-specific optimization.
---
Review note: the writer-side check in thread_rtm and thread_rtm_mix
(validate.c) compares the counter against 8, so its diagnostic now
prints "!= 8" rather than "!= 2".

 include/ck_rwlock.h                          | 111 ++++++++++
 regressions/ck_rwlock/benchmark/latency.c    |  36 +++-
 regressions/ck_rwlock/benchmark/throughput.c | 204 ++++++++++++++-----
 regressions/ck_rwlock/validate/validate.c    | 202 +++++++++++++++---
 4 files changed, 465 insertions(+), 88 deletions(-)

diff --git a/include/ck_rwlock.h b/include/ck_rwlock.h
index 81587ac..69a0097 100644
--- a/include/ck_rwlock.h
+++ b/include/ck_rwlock.h
@@ -58,6 +58,21 @@ ck_rwlock_write_unlock(ck_rwlock_t *rw)
 	return;
 }
 
+#ifdef CK_F_PR_RTM
+CK_CC_INLINE static void
+ck_rwlock_write_unlock_rtm(ck_rwlock_t *rw)
+{
+
+	if (ck_pr_load_uint(&rw->writer) == 0) {
+		ck_pr_rtm_end();
+		return;
+	}
+
+	ck_rwlock_write_unlock(rw);
+	return;
+}
+#endif /* CK_F_PR_RTM */
+
 CK_CC_INLINE static void
 ck_rwlock_write_downgrade(ck_rwlock_t *rw)
 {
@@ -67,6 +82,25 @@ ck_rwlock_write_downgrade(ck_rwlock_t *rw)
 	return;
 }
 
+#ifdef CK_F_PR_RTM
+CK_CC_INLINE static void
+ck_rwlock_write_downgrade_rtm(ck_rwlock_t *rw)
+{
+
+	if (ck_pr_load_uint(&rw->writer) != 0) {
+		ck_rwlock_write_downgrade(rw);
+		return;
+	}
+
+	/*
+	 * Both reader and writer counters are in read-set. A transactional
+	 * abort will occur in the presence of another writer. Inner-most
+	 * read_unlock call will attempt a transactional commit.
+	 */
+	return;
+}
+#endif /* CK_F_PR_RTM */
+
 CK_CC_INLINE static bool
 ck_rwlock_write_trylock(ck_rwlock_t *rw)
 {
@@ -84,6 +118,27 @@ ck_rwlock_write_trylock(ck_rwlock_t *rw)
 	return true;
 }
 
+#ifdef CK_F_PR_RTM
+CK_CC_INLINE static bool
+ck_rwlock_write_trylock_rtm(ck_rwlock_t *rw)
+{
+	bool r;
+
+	if (ck_pr_rtm_begin() != CK_PR_RTM_STARTED) {
+		return ck_rwlock_write_trylock(rw);
+	}
+
+	r = ck_pr_load_uint(&rw->writer) != 0;
+
+	ck_pr_fence_load();
+
+	if (r | (ck_pr_load_uint(&rw->n_readers) != 0))
+		ck_pr_rtm_abort(0);
+
+	return true;
+}
+#endif /* CK_F_PR_RTM */
+
 CK_CC_INLINE static void
 ck_rwlock_write_lock(ck_rwlock_t *rw)
 {
@@ -99,6 +154,28 @@ ck_rwlock_write_lock(ck_rwlock_t *rw)
 	return;
 }
 
+#ifdef CK_F_PR_RTM
+CK_CC_INLINE static void
+ck_rwlock_write_lock_rtm(ck_rwlock_t *rw)
+{
+	bool r;
+
+	if (ck_pr_rtm_begin() != CK_PR_RTM_STARTED) {
+		ck_rwlock_write_lock(rw);
+		return;
+	}
+
+	r = ck_pr_load_uint(&rw->writer) != 0;
+
+	ck_pr_fence_load();
+
+	if (r | (ck_pr_load_uint(&rw->n_readers) != 0))
+		ck_pr_rtm_abort(0);
+
+	return;
+}
+#endif /* CK_F_PR_RTM */
+
 CK_CC_INLINE static bool
 ck_rwlock_read_trylock(ck_rwlock_t *rw)
 {
@@ -141,6 +218,7 @@ ck_rwlock_read_lock(ck_rwlock_t *rw)
 
 		if (ck_pr_load_uint(&rw->writer) == 0)
 			break;
+
 		ck_pr_dec_uint(&rw->n_readers);
 	}
 
@@ -149,6 +227,23 @@ ck_rwlock_read_lock(ck_rwlock_t *rw)
 	return;
 }
 
+#ifdef CK_F_PR_RTM
+CK_CC_INLINE static void
+ck_rwlock_read_lock_rtm(ck_rwlock_t *rw)
+{
+
+	if (ck_pr_rtm_begin() == CK_PR_RTM_STARTED) {
+		if (ck_pr_load_uint(&rw->writer) != 0)
+			ck_pr_rtm_abort(0);
+
+		return;
+	}
+
+	ck_rwlock_read_lock(rw);
+	return;
+}
+#endif /* CK_F_PR_RTM */
+
 CK_CC_INLINE static void
 ck_rwlock_read_unlock(ck_rwlock_t *rw)
 {
@@ -158,6 +253,21 @@ ck_rwlock_read_unlock(ck_rwlock_t *rw)
 	return;
 }
 
+#ifdef CK_F_PR_RTM
+CK_CC_INLINE static void
+ck_rwlock_read_unlock_rtm(ck_rwlock_t *rw)
+{
+
+	if (ck_pr_load_uint(&rw->n_readers) == 0) {
+		ck_pr_rtm_end();
+	} else {
+		ck_rwlock_read_unlock(rw);
+	}
+
+	return;
+}
+#endif /* CK_F_PR_RTM */
+
 /*
  * Recursive writer reader-writer lock implementation.
  */
@@ -251,3 +361,4 @@ ck_rwlock_recursive_read_unlock(ck_rwlock_recursive_t *rw)
 }
 
 #endif /* _CK_RWLOCK_H */
+
diff --git a/regressions/ck_rwlock/benchmark/latency.c b/regressions/ck_rwlock/benchmark/latency.c
index 3b97661..fc702e0 100644
--- a/regressions/ck_rwlock/benchmark/latency.c
+++ b/regressions/ck_rwlock/benchmark/latency.c
@@ -51,7 +51,22 @@ main(void)
 		ck_rwlock_write_unlock(&rwlock);
 	}
 	e_b = rdtsc();
-	printf("WRITE: rwlock %15" PRIu64 "\n", (e_b - s_b) / STEPS);
+	printf(" WRITE: rwlock %15" PRIu64 "\n", (e_b - s_b) / STEPS);
+
+#ifdef CK_F_PR_RTM
+	for (i = 0; i < STEPS; i++) {
+		ck_rwlock_write_lock_rtm(&rwlock);
+		ck_rwlock_write_unlock_rtm(&rwlock);
+	}
+
+	s_b = rdtsc();
+	for (i = 0; i < STEPS; i++) {
+		ck_rwlock_write_lock_rtm(&rwlock);
+		ck_rwlock_write_unlock_rtm(&rwlock);
+	}
+	e_b = rdtsc();
+	printf(" (rtm) WRITE: rwlock %15" PRIu64 "\n", (e_b - s_b) / STEPS);
+#endif /* CK_F_PR_RTM */
 
 	for (i = 0; i < STEPS; i++) {
 		ck_rwlock_read_lock(&rwlock);
@@ -64,8 +79,23 @@ main(void)
 		ck_rwlock_read_unlock(&rwlock);
 	}
 	e_b = rdtsc();
-	printf("READ: rwlock %15" PRIu64 "\n", (e_b - s_b) / STEPS);
+	printf(" READ: rwlock %15" PRIu64 "\n", (e_b - s_b) / STEPS);
+
+#ifdef CK_F_PR_RTM
+	for (i = 0; i < STEPS; i++) {
+		ck_rwlock_read_lock_rtm(&rwlock);
+		ck_rwlock_read_unlock_rtm(&rwlock);
+	}
+
+	s_b = rdtsc();
+	for (i = 0; i < STEPS; i++) {
+		ck_rwlock_read_lock_rtm(&rwlock);
+		ck_rwlock_read_unlock_rtm(&rwlock);
+	}
+	e_b = rdtsc();
+	printf(" (rtm) READ: rwlock %15" PRIu64 "\n", (e_b - s_b) / STEPS);
+#endif /* CK_F_PR_RTM */
 
-	return (0);
+	return 0;
 }
 
diff --git a/regressions/ck_rwlock/benchmark/throughput.c b/regressions/ck_rwlock/benchmark/throughput.c
index bb244f5..6ace58d 100644
--- a/regressions/ck_rwlock/benchmark/throughput.c
+++ b/regressions/ck_rwlock/benchmark/throughput.c
@@ -41,11 +41,17 @@
 static int barrier;
 static int threads;
 static unsigned int flag CK_CC_CACHELINE;
-static ck_rwlock_t rwlock = CK_RWLOCK_INITIALIZER;
+static struct {
+	ck_rwlock_t lock;
+} rw CK_CC_CACHELINE = {
+	.lock = CK_RWLOCK_INITIALIZER
+};
+
 static struct affinity affinity;
 
+#ifdef CK_F_PR_RTM
 static void *
-thread_rwlock(void *pun)
+thread_lock_rtm(void *pun)
 {
 	uint64_t s_b, e_b, a, i;
 	uint64_t *value = pun;
@@ -61,38 +67,38 @@ thread_rwlock(void *pun)
 
 	for (i = 1, a = 0;; i++) {
 		s_b = rdtsc();
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
-		ck_rwlock_read_lock(&rwlock);
-		ck_rwlock_read_unlock(&rwlock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
+		ck_rwlock_read_lock_rtm(&rw.lock);
+		ck_rwlock_read_unlock_rtm(&rw.lock);
 		e_b = rdtsc();
 
 		a += (e_b - s_b) >> 4;
@@ -108,39 +114,87 @@ thread_rwlock(void *pun)
 	*value = (a / i);
 	return NULL;
 }
+#endif /* CK_F_PR_RTM */
 
-int
-main(int argc, char *argv[])
+static void *
+thread_lock(void *pun)
 {
-	int t;
-	pthread_t *p;
-	uint64_t *latency;
+	uint64_t s_b, e_b, a, i;
+	uint64_t *value = pun;
 
-	if (argc != 3) {
-		ck_error("Usage: throughput \n");
+	if (aff_iterate(&affinity) != 0) {
+		perror("ERROR: Could not affine thread");
+		exit(EXIT_FAILURE);
 	}
 
-	threads = atoi(argv[2]);
-	if (threads <= 0) {
-		ck_error("ERROR: Threads must be a value > 0.\n");
-	}
+	ck_pr_inc_int(&barrier);
+	while (ck_pr_load_int(&barrier) != threads)
+		ck_pr_stall();
 
-	p = malloc(sizeof(pthread_t) * threads);
-	if (p == NULL) {
-		ck_error("ERROR: Failed to initialize thread.\n");
-	}
+	for (i = 1, a = 0;; i++) {
+		s_b = rdtsc();
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		ck_rwlock_read_lock(&rw.lock);
+		ck_rwlock_read_unlock(&rw.lock);
+		e_b = rdtsc();
 
-	latency = malloc(sizeof(uint64_t) * threads);
-	if (latency == NULL) {
-		ck_error("ERROR: Failed to create latency buffer.\n");
+		a += (e_b - s_b) >> 4;
+
+		if (ck_pr_load_uint(&flag) == 1)
+			break;
 	}
 
-	affinity.delta = atoi(argv[1]);
+	ck_pr_inc_int(&barrier);
+	while (ck_pr_load_int(&barrier) != threads * 2)
+		ck_pr_stall();
+
+	*value = (a / i);
+	return NULL;
+}
+
+static void
+rwlock_test(pthread_t *p, int d, uint64_t *latency, void *(*f)(void *), const char *label)
+{
+	int t;
+
+	ck_pr_store_int(&barrier, 0);
+	ck_pr_store_uint(&flag, 0);
+
+	affinity.delta = d;
 	affinity.request = 0;
 
-	fprintf(stderr, "Creating threads (rwlock)...");
+	fprintf(stderr, "Creating threads (%s)...", label);
 	for (t = 0; t < threads; t++) {
-		if (pthread_create(&p[t], NULL, thread_rwlock, latency + t) != 0) {
+		if (pthread_create(&p[t], NULL, f, latency + t) != 0) {
 			ck_error("ERROR: Could not create thread %d\n", t);
 		}
 	}
@@ -157,6 +211,44 @@ main(int argc, char *argv[])
 	for (t = 1; t <= threads; t++)
 		printf("%10u %20" PRIu64 "\n", t, latency[t - 1]);
 
-	return (0);
+	fprintf(stderr, "\n");
+	return;
+}
+
+
+int
+main(int argc, char *argv[])
+{
+	int d;
+	pthread_t *p;
+	uint64_t *latency;
+
+	if (argc != 3) {
+		ck_error("Usage: throughput \n");
+	}
+
+	threads = atoi(argv[2]);
+	if (threads <= 0) {
+		ck_error("ERROR: Threads must be a value > 0.\n");
+	}
+
+	p = malloc(sizeof(pthread_t) * threads);
+	if (p == NULL) {
+		ck_error("ERROR: Failed to initialize thread.\n");
+	}
+
+	latency = malloc(sizeof(uint64_t) * threads);
+	if (latency == NULL) {
+		ck_error("ERROR: Failed to create latency buffer.\n");
+	}
+
+	d = atoi(argv[1]);
+	rwlock_test(p, d, latency, thread_lock, "rwlock");
+
+#ifdef CK_F_PR_RTM
+	rwlock_test(p, d, latency, thread_lock_rtm, "rwlock, rtm");
+#endif /* CK_F_PR_RTM */
+
+	return 0;
 }
 
diff --git a/regressions/ck_rwlock/validate/validate.c b/regressions/ck_rwlock/validate/validate.c
index 46ca1df..5c19afe 100644
--- a/regressions/ck_rwlock/validate/validate.c
+++ b/regressions/ck_rwlock/validate/validate.c
@@ -123,10 +123,156 @@ thread_recursive(void *null CK_CC_UNUSED)
 	return (NULL);
 }
 
+#ifdef CK_F_PR_RTM
+static void *
+thread_rtm_mix(void *null CK_CC_UNUSED)
+{
+	unsigned int i = ITERATE;
+	unsigned int l;
+
+	if (aff_iterate(&a)) {
+		perror("ERROR: Could not affine thread");
+		exit(EXIT_FAILURE);
+	}
+
+	while (i--) {
+		if (i & 1) {
+			ck_rwlock_write_lock_rtm(&lock);
+		} else {
+			ck_rwlock_write_lock(&lock);
+		}
+
+		{
+			l = ck_pr_load_uint(&locked);
+			if (l != 0) {
+				ck_error("ERROR [WR:%d]: %u != 0\n", __LINE__, l);
+			}
+
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+
+			l = ck_pr_load_uint(&locked);
+			if (l != 8) {
+				ck_error("ERROR [WR:%d]: %u != 8\n", __LINE__, l);
+			}
+
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+
+			l = ck_pr_load_uint(&locked);
+			if (l != 0) {
+				ck_error("ERROR [WR:%d]: %u != 0\n", __LINE__, l);
+			}
+		}
+
+		if (i & 1) {
+			ck_rwlock_write_unlock_rtm(&lock);
+		} else {
+			ck_rwlock_write_unlock(&lock);
+		}
+
+		if (i & 1) {
+			ck_rwlock_read_lock_rtm(&lock);
+		} else {
+			ck_rwlock_read_lock(&lock);
+		}
+
+		{
+			l = ck_pr_load_uint(&locked);
+			if (l != 0) {
+				ck_error("ERROR [RD:%d]: %u != 0\n", __LINE__, l);
+			}
+		}
+
+		if (i & 1) {
+			ck_rwlock_read_unlock_rtm(&lock);
+		} else {
+			ck_rwlock_read_unlock(&lock);
+		}
+	}
+
+	return (NULL);
+}
+
+static void *
+thread_rtm(void *null CK_CC_UNUSED)
+{
+	unsigned int i = ITERATE;
+	unsigned int l;
+
+	if (aff_iterate(&a)) {
+		perror("ERROR: Could not affine thread");
+		exit(EXIT_FAILURE);
+	}
+
+	while (i--) {
+		ck_rwlock_write_lock_rtm(&lock);
+		{
+			l = ck_pr_load_uint(&locked);
+			if (l != 0) {
+				ck_error("ERROR [WR:%d]: %u != 0\n", __LINE__, l);
+			}
+
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+			ck_pr_inc_uint(&locked);
+
+			l = ck_pr_load_uint(&locked);
+			if (l != 8) {
+				ck_error("ERROR [WR:%d]: %u != 8\n", __LINE__, l);
+			}
+
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+			ck_pr_dec_uint(&locked);
+
+			l = ck_pr_load_uint(&locked);
+			if (l != 0) {
+				ck_error("ERROR [WR:%d]: %u != 0\n", __LINE__, l);
+			}
+		}
+		ck_rwlock_write_unlock_rtm(&lock);
+
+		ck_rwlock_read_lock_rtm(&lock);
+		{
+			l = ck_pr_load_uint(&locked);
+			if (l != 0) {
+				ck_error("ERROR [RD:%d]: %u != 0\n", __LINE__, l);
+			}
+		}
+		ck_rwlock_read_unlock_rtm(&lock);
+	}
+
+	return (NULL);
+}
+#endif /* CK_F_PR_RTM */
+
 static void *
 thread(void *null CK_CC_UNUSED)
 {
-	int i = ITERATE;
+	unsigned int i = ITERATE;
 	unsigned int l;
 
 	if (aff_iterate(&a)) {
@@ -185,11 +331,29 @@ thread(void *null CK_CC_UNUSED)
 	return (NULL);
 }
 
+static void
+rwlock_test(pthread_t *threads, void *(*f)(void *), const char *test)
+{
+	int i;
+
+	fprintf(stderr, "Creating threads (%s)...", test);
+	for (i = 0; i < nthr; i++) {
+		if (pthread_create(&threads[i], NULL, f, NULL)) {
+			ck_error("ERROR: Could not create thread %d\n", i);
+		}
+	}
+	fprintf(stderr, ".");
+
+	for (i = 0; i < nthr; i++)
+		pthread_join(threads[i], NULL);
+	fprintf(stderr, "done (passed)\n");
+	return;
+}
+
 int
 main(int argc, char *argv[])
 {
 	pthread_t *threads;
-	int i;
 
 	if (argc != 3) {
 		ck_error("Usage: validate \n");
@@ -207,32 +371,12 @@ main(int argc, char *argv[])
 
 	a.delta = atoi(argv[2]);
 
-	fprintf(stderr, "Creating threads (mutual exclusion)...");
-	for (i = 0; i < nthr; i++) {
-		if (pthread_create(&threads[i], NULL, thread, NULL)) {
-			ck_error("ERROR: Could not create thread %d\n", i);
-		}
-	}
-	fprintf(stderr, "done\n");
-
-	fprintf(stderr, "Waiting for threads to finish correctness regression...");
-	for (i = 0; i < nthr; i++)
-		pthread_join(threads[i], NULL);
-	fprintf(stderr, "done (passed)\n");
-
-	fprintf(stderr, "Creating threads (mutual exclusion, recursive)...");
-	for (i = 0; i < nthr; i++) {
-		if (pthread_create(&threads[i], NULL, thread_recursive, NULL)) {
-			ck_error("ERROR: Could not create thread %d\n", i);
-		}
-	}
-	fprintf(stderr, "done\n");
-
-	fprintf(stderr, "Waiting for threads to finish correctness regression...");
-	for (i = 0; i < nthr; i++)
-		pthread_join(threads[i], NULL);
-	fprintf(stderr, "done (passed)\n");
-
-	return (0);
+	rwlock_test(threads, thread, "regular");
+#ifdef CK_F_PR_RTM
+	rwlock_test(threads, thread_rtm, "rtm");
+	rwlock_test(threads, thread_rtm_mix, "rtm-mix");
+#endif
+	rwlock_test(threads, thread_recursive, "recursive");
+	return 0;
 }
 