ck_pring
Brendon Scheinman 12 years ago
commit f9ea659bff

.gitignore vendored: 1 change

@@ -140,6 +140,7 @@ regressions/ck_brlock/benchmark/throughput
regressions/ck_rwlock/benchmark/throughput
regressions/ck_queue/validate/ck_list
regressions/ck_queue/validate/ck_slist
regressions/ck_queue/validate/ck_stailq
regressions/ck_cohort/validate/validate
regressions/ck_cohort/benchmark/ck_cohort.LATENCY
regressions/ck_cohort/benchmark/ck_cohort.THROUGHPUT

configure vendored: 5 changes

@@ -579,11 +579,10 @@ mkdir -p $P_PWD/src
if test "$P_PWD" '!=' "$BUILD_DIR"; then
mkdir -p $P_PWD/regressions
cp $BUILD_DIR/regressions/Makefile.unsupported $P_PWD/regressions/Makefile &> /dev/null
cp $BUILD_DIR/build/ck.build.$PROFILE $P_PWD/build/ck.build.$PROFILE &> /dev/null
cp $BUILD_DIR/include/ck_md.h $P_PWD/include/ck_md.h &> /dev/null
fi
cp $BUILD_DIR/build/ck.build.$PROFILE $P_PWD/build/ck.build.$PROFILE &> /dev/null
cp $BUILD_DIR/include/ck_md.h $P_PWD/include/ck_md.h &> /dev/null
generate src/Makefile.in $P_PWD/src/Makefile
generate doc/Makefile.in $P_PWD/doc/Makefile
generate build/ck.build.in $P_PWD/build/ck.build

@@ -91,10 +91,17 @@ OBJECTS=ck_ht_count \
ck_pr \
ck_pr_barrier \
ck_pr_fas \
ck_pr_fence_atomic \
ck_pr_fence_atomic_load \
ck_pr_fence_atomic_store \
ck_pr_fence_load \
ck_pr_fence_load_atomic \
ck_pr_fence_load_store \
ck_pr_fence_load_depends \
ck_pr_fence_memory \
ck_pr_fence_store \
ck_pr_fence_store_atomic \
ck_pr_fence_store_load \
ck_pr_stall \
ck_pr_faa \
ck_pr_inc \

@@ -0,0 +1,111 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 16, 2013
.Dt CK_PR_FENCE_ATOMIC 3
.Sh NAME
.Nm ck_pr_fence_atomic
.Nd enforce partial ordering of atomic read-modify-write operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_atomic void
.Ft void
.Fn ck_pr_fence_strict_atomic void
.Sh DESCRIPTION
The
.Fn ck_pr_fence_atomic
function enforces the ordering of any
atomic read-modify-write operations relative to
the invocation of the function. This function
always serves as an implicit compiler barrier. On
architectures implementing CK_MD_TSO, this operation
only serves as a compiler barrier and no fences
are emitted. On architectures implementing
CK_MD_PSO and CK_MD_RMO, a store fence is
emitted. To force the unconditional emission of
a fence, use
.Fn ck_pr_fence_strict_atomic .
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static int a = 0;
static int b = 0;
static int c = 0;
void
function(void)
{
ck_pr_fas_int(&a, 1);
/*
* Guarantee that the update to a is completed
* with respect to the updates of b and c.
*/
ck_pr_fence_atomic();
ck_pr_fas_int(&b, 2);
ck_pr_fas_int(&c, 2);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -0,0 +1,108 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 16, 2013
.Dt CK_PR_FENCE_ATOMIC_LOAD 3
.Sh NAME
.Nm ck_pr_fence_atomic_load
.Nd enforce ordering of atomic read-modify-write operations to load operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_atomic_load void
.Ft void
.Fn ck_pr_fence_strict_atomic_load void
.Sh DESCRIPTION
The
.Fn ck_pr_fence_atomic_load
function enforces the ordering of any
atomic read-modify-write operations relative to
any load operations following the function invocation. This function
always serves as an implicit compiler barrier. On
architectures implementing CK_MD_TSO or CK_MD_PSO, this operation
only serves as a compiler barrier and no fences
are emitted. To force the unconditional emission of
a fence, use
.Fn ck_pr_fence_strict_atomic_load .
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static int a = 0;
static int b = 0;
void
function(void)
{
int c;
ck_pr_fas_int(&a, 1);
/*
* Guarantee that the update to a is completed
* with respect to the load from b.
*/
ck_pr_fence_atomic_load();
c = ck_pr_load_int(&b);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -0,0 +1,109 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 16, 2013
.Dt CK_PR_FENCE_ATOMIC_STORE 3
.Sh NAME
.Nm ck_pr_fence_atomic_store
.Nd enforce ordering of atomic read-modify-write operations to store operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_atomic_store void
.Ft void
.Fn ck_pr_fence_strict_atomic_store void
.Sh DESCRIPTION
The
.Fn ck_pr_fence_atomic_store
function enforces the ordering of any
atomic read-modify-write operations relative to
any store operations following the function invocation. This function
always serves as an implicit compiler barrier. On
architectures implementing CK_MD_TSO, this operation
only serves as a compiler barrier and no fences
are emitted. To force the unconditional emission of
a fence, use
.Fn ck_pr_fence_strict_atomic_store .
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static int a = 0;
static int b = 0;
void
function(void)
{
ck_pr_fas_int(&a, 1);
/*
* Guarantee that the update to a is completed
* with respect to the store to b.
*/
ck_pr_fence_atomic_store();
ck_pr_store_int(&b, 2);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -83,6 +83,11 @@ function(void)
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_memory 3 ,

@@ -0,0 +1,113 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 18, 2013
.Dt CK_PR_FENCE_LOAD_ATOMIC 3
.Sh NAME
.Nm ck_pr_fence_load_atomic
.Nd enforce ordering of load operations to atomic read-modify-write operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_load_atomic void
.Ft void
.Fn ck_pr_fence_strict_load_atomic void
.Sh DESCRIPTION
This function enforces the ordering of any memory load
and
.Xr ck_pr_load 3
operations with respect to atomic read-modify-write operations relative to
the invocation of the function. Any store operations that
were committed on remote processors
and received by the calling processor before the invocation of
.Fn ck_pr_fence_load_atomic
are also made visible only after a call to
the ck_pr_fence_load family of functions.
This function always serves as an implicit compiler barrier.
On architectures with CK_MD_TSO or CK_MD_PSO specified (total store ordering
and partial store ordering respectively), this operation only serves
as a compiler barrier and no fence instructions will be emitted. To
force the unconditional emission of a load fence, use
.Fn ck_pr_fence_strict_load_atomic .
Architectures implementing CK_MD_RMO always emit a fence.
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static unsigned int a;
static unsigned int b;
void
function(void)
{
unsigned int snapshot_a;
snapshot_a = ck_pr_load_uint(&a);
/*
* Guarantee that the load from "a" completes
* before the update to "b".
*/
ck_pr_fence_load_atomic();
ck_pr_fas_uint(&b, 1);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -45,7 +45,12 @@ which re-orders data-dependent loads (such as the defunct Alpha), this function
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,

@@ -0,0 +1,113 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 18, 2013
.Dt CK_PR_FENCE_LOAD_STORE 3
.Sh NAME
.Nm ck_pr_fence_load_store
.Nd enforce ordering of load operations to store operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_load_store void
.Ft void
.Fn ck_pr_fence_strict_load_store void
.Sh DESCRIPTION
This function enforces the ordering of any memory load
and
.Xr ck_pr_load 3
operations with respect to store operations relative to
the invocation of the function. Any store operations that
were committed on remote processors
and received by the calling processor before the invocation of
.Fn ck_pr_fence_load_store
are also made visible only after a call to
the ck_pr_fence_load family of functions.
This function always serves as an implicit compiler barrier.
On architectures with CK_MD_TSO specified (total store ordering),
this operation only serves
as a compiler barrier and no fence instructions will be emitted. To
force the unconditional emission of a load fence, use
.Fn ck_pr_fence_strict_load_store .
Architectures implementing CK_MD_PSO or CK_MD_RMO always emit a fence.
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static unsigned int a;
static unsigned int b;
void
function(void)
{
unsigned int snapshot_a;
snapshot_a = ck_pr_load_uint(&a);
/*
* Guarantee that the load from "a" completes
* before the store to "b".
*/
ck_pr_fence_load_store();
ck_pr_store_uint(&b, 1);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -85,6 +85,9 @@ function(void)
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_store 3 ,

@@ -82,7 +82,12 @@ function(void)
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,

@@ -0,0 +1,108 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 18, 2013
.Dt CK_PR_FENCE_STORE_ATOMIC 3
.Sh NAME
.Nm ck_pr_fence_store_atomic
.Nd enforce ordering of store operations to atomic read-modify-write operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_store_atomic void
.Ft void
.Fn ck_pr_fence_strict_store_atomic void
.Sh DESCRIPTION
The
.Fn ck_pr_fence_store_atomic
function enforces the ordering of any memory store,
.Fn ck_pr_store
and atomic read-modify-write operations to atomic read-modify-write
operations relative to the invocation of the function. This function
always serves as an implicit compiler barrier.
This function will emit a fence for PSO and RMO
targets. To force the unconditional emission of a fence, use the
.Fn ck_pr_fence_strict_store_atomic
function.
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static int a = 0;
static int b = 0;
void
function(void)
{
ck_pr_store_int(&a, 1);
/*
* Guarantee that the store to a is completed
* with respect to the update of b.
*/
ck_pr_fence_store_atomic();
ck_pr_add_int(&b, 2);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_store_load 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -0,0 +1,107 @@
.\"
.\" Copyright 2013 Samy Al Bahra.
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"
.Dd May 18, 2013
.Dt CK_PR_FENCE_STORE_LOAD 3
.Sh NAME
.Nm ck_pr_fence_store_load
.Nd enforce ordering of store operations to load operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_pr.h
.Ft void
.Fn ck_pr_fence_store_load void
.Ft void
.Fn ck_pr_fence_strict_store_load void
.Sh DESCRIPTION
The
.Fn ck_pr_fence_store_load
function enforces the ordering of any memory store,
.Fn ck_pr_store
and atomic read-modify-write operations to load
operations relative to the invocation of the function. This function
always serves as an implicit compiler barrier.
A fence will currently always be emitted for this
operation, including for TSO memory model targets.
.Sh EXAMPLE
.Bd -literal -offset indent
#include <ck_pr.h>
static int a = 0;
static int b = 0;
void
function(void)
{
int snapshot_b;
ck_pr_store_int(&a, 1);
/*
* Guarantee that the store to a is completed
* with respect to the load from b.
*/
ck_pr_fence_store_load();
snapshot_b = ck_pr_load_int(&b);
return;
}
.Ed
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO
.Xr ck_pr_stall 3 ,
.Xr ck_pr_fence_atomic 3 ,
.Xr ck_pr_fence_atomic_store 3 ,
.Xr ck_pr_fence_atomic_load 3 ,
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_atomic 3 ,
.Xr ck_pr_fence_load_store 3 ,
.Xr ck_pr_fence_load_depends 3 ,
.Xr ck_pr_fence_store 3 ,
.Xr ck_pr_fence_store_atomic 3 ,
.Xr ck_pr_fence_memory 3 ,
.Xr ck_pr_barrier 3 ,
.Xr ck_pr_fas 3 ,
.Xr ck_pr_load 3 ,
.Xr ck_pr_store 3 ,
.Xr ck_pr_faa 3 ,
.Xr ck_pr_inc 3 ,
.Xr ck_pr_dec 3 ,
.Xr ck_pr_neg 3 ,
.Xr ck_pr_not 3 ,
.Xr ck_pr_add 3 ,
.Xr ck_pr_sub 3 ,
.Xr ck_pr_and 3 ,
.Xr ck_pr_or 3 ,
.Xr ck_pr_xor 3 ,
.Xr ck_pr_cas 3 ,
.Xr ck_pr_btc 3 ,
.Xr ck_pr_bts 3 ,
.Xr ck_pr_btr 3
.Pp
Additional information available at http://concurrencykit.org/

@@ -83,7 +83,7 @@ ck_brlock_write_lock(struct ck_brlock *br)
while (ck_pr_fas_uint(&br->writer, true) == true)
ck_pr_stall();
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
/* The reader list is protected under the writer br. */
for (cursor = br->readers; cursor != NULL; cursor = cursor->next) {
@@ -121,7 +121,7 @@ ck_brlock_write_trylock(struct ck_brlock *br, unsigned int factor)
* We do not require a strict fence here as atomic RMW operations
* are serializing.
*/
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
for (cursor = br->readers; cursor != NULL; cursor = cursor->next) {
while (ck_pr_load_uint(&cursor->n_readers) != 0) {
@@ -190,13 +190,19 @@ ck_brlock_read_lock(struct ck_brlock *br, struct ck_brlock_reader *reader)
#if defined(__x86__) || defined(__x86_64__)
ck_pr_fas_uint(&reader->n_readers, 1);
/* Serialize counter update with respect to writer snapshot. */
ck_pr_fence_memory();
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_atomic_load();
#else
ck_pr_store_uint(&reader->n_readers, 1);
/* Loads can be re-ordered before previous stores, even on TSO. */
ck_pr_fence_strict_memory();
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_store_load();
#endif
if (ck_pr_load_uint(&br->writer) == false)
@@ -229,10 +235,23 @@ ck_brlock_read_trylock(struct ck_brlock *br,
ck_pr_stall();
}
#if defined(__x86__) || defined(__x86_64__)
ck_pr_fas_uint(&reader->n_readers, 1);
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_atomic_load();
#else
ck_pr_store_uint(&reader->n_readers, 1);
/* Loads are re-ordered with respect to prior stores. */
ck_pr_fence_strict_memory();
/*
* Serialize reader counter update with respect to load of
* writer.
*/
ck_pr_fence_store_load();
#endif
if (ck_pr_load_uint(&br->writer) == false)
break;
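Both reader paths above share one pattern: announce the reader, order that announcement against the subsequent load of the writer flag, and back off if a writer is present. The following is a minimal sketch of that pattern, not code from this commit; the flag names are hypothetical and only the fence semantics introduced here are assumed.

#include <ck_pr.h>
#include <stdbool.h>

/* Hypothetical flags for illustration. */
static unsigned int reader_active;
static unsigned int writer_present;

static bool
reader_try_enter(void)
{
#if defined(__x86__) || defined(__x86_64__)
	/*
	 * Atomic RMW operations serialize on x86, so the fence
	 * compiles down to a compiler barrier under TSO.
	 */
	ck_pr_fas_uint(&reader_active, 1);
	ck_pr_fence_atomic_load();
#else
	/*
	 * A plain store may be re-ordered after the load below,
	 * even on TSO, so a store->load fence is required.
	 */
	ck_pr_store_uint(&reader_active, 1);
	ck_pr_fence_store_load();
#endif
	if (ck_pr_load_uint(&writer_present) == 0)
		return true;

	ck_pr_store_uint(&reader_active, 0);
	return false;
}

On x86 the fetch-and-swap path is preferred because the read-modify-write is already serializing, so no fence instruction is needed; elsewhere the plain store must be followed by an explicit ck_pr_fence_store_load.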

@@ -93,7 +93,7 @@ ck_bytelock_write_lock(struct ck_bytelock *bytelock, unsigned int slot)
ck_pr_store_8(&bytelock->readers[slot - 1], false);
/* Wait for slotted readers to drain out. */
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
for (i = 0; i < sizeof(bytelock->readers) / CK_BYTELOCK_LENGTH; i++) {
while (CK_BYTELOCK_LOAD((CK_BYTELOCK_TYPE *)&readers[i]) != false)
ck_pr_stall();
@@ -134,7 +134,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot)
if (slot > sizeof bytelock->readers) {
for (;;) {
ck_pr_inc_uint(&bytelock->n_readers);
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&bytelock->owner) == 0)
break;
ck_pr_dec_uint(&bytelock->n_readers);
@@ -150,7 +150,7 @@ ck_bytelock_read_lock(struct ck_bytelock *bytelock, unsigned int slot)
slot -= 1;
for (;;) {
ck_pr_store_8(&bytelock->readers[slot], true);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
/*
* If there is no owner at this point, our slot has

@@ -97,12 +97,11 @@ ck_epoch_begin(ck_epoch_t *epoch, ck_epoch_record_t *record)
/*
* It is possible for loads to be re-ordered before the store
* is committed into the caller's epoch and active fields.
* Execute a full barrier to serialize stores with respect to
* loads
* For this reason, store to load serialization is necessary.
*/
ck_pr_store_uint(&record->epoch, g_epoch);
ck_pr_store_uint(&record->active, 1);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
return;
}

@@ -237,7 +237,7 @@ ck_fifo_mpmc_enqueue(struct ck_fifo_mpmc *fifo,
entry->value = value;
entry->next.pointer = NULL;
entry->next.generation = 0;
ck_pr_fence_store();
ck_pr_fence_store_atomic();
for (;;) {
tail.generation = ck_pr_load_ptr(&fifo->tail.generation);
@@ -271,9 +271,10 @@ ck_fifo_mpmc_enqueue(struct ck_fifo_mpmc *fifo,
}
}
ck_pr_fence_atomic();
/* After a successful insert, forward the tail to the new entry. */
update.generation = tail.generation + 1;
ck_pr_fence_store();
ck_pr_cas_ptr_2(&fifo->tail, &tail, &update);
return;
}
@@ -289,7 +290,7 @@ ck_fifo_mpmc_tryenqueue(struct ck_fifo_mpmc *fifo,
entry->next.pointer = NULL;
entry->next.generation = 0;
ck_pr_fence_store();
ck_pr_fence_store_atomic();
tail.generation = ck_pr_load_ptr(&fifo->tail.generation);
ck_pr_fence_load();
@@ -322,8 +323,9 @@ ck_fifo_mpmc_tryenqueue(struct ck_fifo_mpmc *fifo,
return false;
}
ck_pr_fence_atomic();
/* After a successful insert, forward the tail to the new entry. */
ck_pr_fence_store();
update.generation = tail.generation + 1;
ck_pr_cas_ptr_2(&fifo->tail, &tail, &update);
return true;

@@ -76,12 +76,12 @@ ck_hp_fifo_enqueue_mpmc(ck_hp_record_t *record,
entry->value = value;
entry->next = NULL;
ck_pr_fence_store();
ck_pr_fence_store_atomic();
for (;;) {
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, tail);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (tail != ck_pr_load_ptr(&fifo->tail))
continue;
@@ -93,7 +93,7 @@ ck_hp_fifo_enqueue_mpmc(ck_hp_record_t *record,
break;
}
ck_pr_fence_store();
ck_pr_fence_atomic();
ck_pr_cas_ptr(&fifo->tail, tail, entry);
return;
}
@@ -108,11 +108,11 @@ ck_hp_fifo_tryenqueue_mpmc(ck_hp_record_t *record,
entry->value = value;
entry->next = NULL;
ck_pr_fence_store();
ck_pr_fence_store_atomic();
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, tail);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (tail != ck_pr_load_ptr(&fifo->tail))
return false;
@@ -123,7 +123,7 @@ ck_hp_fifo_tryenqueue_mpmc(ck_hp_record_t *record,
} else if (ck_pr_cas_ptr(&fifo->tail->next, next, entry) == false)
return false;
ck_pr_fence_store();
ck_pr_fence_atomic();
ck_pr_cas_ptr(&fifo->tail, tail, entry);
return true;
}
@@ -140,13 +140,13 @@ ck_hp_fifo_dequeue_mpmc(ck_hp_record_t *record,
ck_pr_fence_load();
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, head);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
continue;
next = ck_pr_load_ptr(&head->next);
ck_hp_set(record, 1, next);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
continue;
@@ -175,13 +175,13 @@ ck_hp_fifo_trydequeue_mpmc(ck_hp_record_t *record,
ck_pr_fence_load();
tail = ck_pr_load_ptr(&fifo->tail);
ck_hp_set(record, 0, head);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
return NULL;
next = ck_pr_load_ptr(&head->next);
ck_hp_set(record, 1, next);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (head != ck_pr_load_ptr(&fifo->head))
return NULL;
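Every ck_pr_fence_store_load above sits between publishing a hazard pointer with ck_hp_set and re-validating the protected pointer; that publish-then-revalidate step is the heart of the hazard-pointer protocol. A condensed sketch follows, assuming a hypothetical single-slot record rather than the real ck_hp interface.

#include <ck_pr.h>

/* Hypothetical single hazard-pointer slot for illustration. */
static void *hazard_slot;

static void *
protect(void **location)
{
	void *entry;

	for (;;) {
		entry = ck_pr_load_ptr(location);

		/* Publish the hazard pointer... */
		ck_pr_store_ptr(&hazard_slot, entry);

		/*
		 * ...then re-validate. Without the store->load fence
		 * the re-read could pass the publication and a
		 * reclaimer might free entry without seeing it.
		 */
		ck_pr_fence_store_load();
		if (entry == ck_pr_load_ptr(location))
			return entry;
	}
}

If the re-read still matches, a concurrent reclaimer is guaranteed to observe hazard_slot before freeing entry; ck_hp_set plus the fence plays this role in the FIFO and stack code above.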

@@ -62,7 +62,7 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target)
return NULL;
ck_hp_set(record, 0, entry);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
} while (entry != ck_pr_load_ptr(&target->head));
while (ck_pr_cas_ptr_value(&target->head, entry, entry->next, &entry) == false) {
@@ -70,11 +70,11 @@ ck_hp_stack_pop_mpmc(ck_hp_record_t *record, struct ck_stack *target)
return NULL;
ck_hp_set(record, 0, entry);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
update = ck_pr_load_ptr(&target->head);
while (entry != update) {
ck_hp_set(record, 0, update);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
entry = update;
update = ck_pr_load_ptr(&target->head);
if (update == NULL)
@@ -95,7 +95,7 @@ ck_hp_stack_trypop_mpmc(ck_hp_record_t *record, struct ck_stack *target, struct
return false;
ck_hp_set(record, 0, entry);
ck_pr_fence_strict_memory();
ck_pr_fence_store_load();
if (entry != ck_pr_load_ptr(&target->head))
goto leave;

@@ -30,6 +30,7 @@
#include <ck_cc.h>
#include <ck_limits.h>
#include <ck_md.h>
#include <ck_stdint.h>
#include <stdbool.h>
@@ -43,12 +44,90 @@
#include "gcc/ppc64/ck_pr.h"
#elif defined(__ppc__)
#include "gcc/ppc/ck_pr.h"
#elif defined(__GNUC__)
#include "gcc/ck_pr.h"
#else
#elif !defined(__GNUC__)
#error Your platform is unsupported
#endif
#if defined(__GNUC__)
#include "gcc/ck_pr.h"
#endif
#define CK_PR_FENCE_EMIT(T) \
CK_CC_INLINE static void \
ck_pr_fence_##T(void) \
{ \
ck_pr_fence_strict_##T(); \
return; \
}
#define CK_PR_FENCE_NOOP(T) \
CK_CC_INLINE static void \
ck_pr_fence_##T(void) \
{ \
ck_pr_barrier(); \
return; \
}
/*
* None of the currently supported platforms allow for data-dependent
* load ordering.
*/
CK_PR_FENCE_NOOP(load_depends)
#define ck_pr_fence_strict_load_depends ck_pr_fence_load_depends
/*
* In memory models where atomic operations do not have serializing
* effects, atomic read-modify-write operations are modeled as stores.
*/
#if defined(CK_MD_RMO)
/*
* Only stores to the same location have a global
* ordering.
*/
CK_PR_FENCE_EMIT(atomic)
CK_PR_FENCE_EMIT(atomic_load)
CK_PR_FENCE_EMIT(atomic_store)
CK_PR_FENCE_EMIT(store_atomic)
CK_PR_FENCE_EMIT(load_atomic)
CK_PR_FENCE_EMIT(load_store)
CK_PR_FENCE_EMIT(store_load)
CK_PR_FENCE_EMIT(load)
CK_PR_FENCE_EMIT(store)
CK_PR_FENCE_EMIT(memory)
#elif defined(CK_MD_PSO)
/*
* Anything can be re-ordered with respect to stores.
* Otherwise, loads are executed in-order.
*/
CK_PR_FENCE_EMIT(atomic)
CK_PR_FENCE_NOOP(atomic_load)
CK_PR_FENCE_EMIT(atomic_store)
CK_PR_FENCE_EMIT(store_atomic)
CK_PR_FENCE_NOOP(load_atomic)
CK_PR_FENCE_EMIT(load_store)
CK_PR_FENCE_EMIT(store_load)
CK_PR_FENCE_NOOP(load)
CK_PR_FENCE_EMIT(store)
CK_PR_FENCE_EMIT(memory)
#elif defined(CK_MD_TSO)
/*
* Only loads are re-ordered and only with respect to
* prior stores. Atomic operations are serializing.
*/
CK_PR_FENCE_NOOP(atomic)
CK_PR_FENCE_NOOP(atomic_load)
CK_PR_FENCE_NOOP(atomic_store)
CK_PR_FENCE_NOOP(store_atomic)
CK_PR_FENCE_NOOP(load_atomic)
CK_PR_FENCE_NOOP(load_store)
CK_PR_FENCE_EMIT(store_load)
CK_PR_FENCE_NOOP(load)
CK_PR_FENCE_NOOP(store)
CK_PR_FENCE_NOOP(memory)
#endif /* CK_MD_TSO */
#undef CK_PR_FENCE_EMIT
#undef CK_PR_FENCE_NOOP
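Note that ck_pr_fence_store_load is the only fence emitted under all three models above. The canonical consumer is Dekker-style flag signalling, where each thread stores its own flag and then loads its peer's; a sketch with hypothetical flags follows.

#include <ck_pr.h>
#include <stdbool.h>

/* Hypothetical per-thread flags; self is 0 or 1. */
static unsigned int flag[2];

static bool
try_enter(unsigned int self)
{

	ck_pr_store_uint(&flag[self], 1);

	/*
	 * Without a store->load fence the load below may be
	 * satisfied before the store above is visible, letting
	 * both threads enter. This fence is emitted even on TSO.
	 */
	ck_pr_fence_store_load();

	if (ck_pr_load_uint(&flag[1 - self]) != 0) {
		ck_pr_store_uint(&flag[self], 0);
		return false;
	}

	return true;
}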
#define CK_PR_BIN(K, S, M, T, P, C) \
CK_CC_INLINE static void \
ck_pr_##K##_##S(M *target, T value) \

@@ -74,7 +74,8 @@ ck_rwlock_write_trylock(ck_rwlock_t *rw)
if (ck_pr_fas_uint(&rw->writer, 1) != 0)
return false;
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->n_readers) != 0) {
ck_rwlock_write_unlock(rw);
return false;
@@ -90,7 +91,7 @@ ck_rwlock_write_lock(ck_rwlock_t *rw)
while (ck_pr_fas_uint(&rw->writer, 1) != 0)
ck_pr_stall();
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
while (ck_pr_load_uint(&rw->n_readers) != 0)
ck_pr_stall();
@@ -111,16 +112,15 @@ ck_rwlock_read_trylock(ck_rwlock_t *rw)
* Serialize with respect to concurrent write
* lock operation.
*/
ck_pr_fence_memory();
if (ck_pr_load_uint(&rw->writer) == 0)
goto leave;
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->writer) == 0) {
ck_pr_fence_load();
return true;
}
ck_pr_dec_uint(&rw->n_readers);
return false;
leave:
/* Acquire semantics are necessary. */
ck_pr_fence_load();
return true;
}
CK_CC_INLINE static void
@@ -137,7 +137,8 @@ ck_rwlock_read_lock(ck_rwlock_t *rw)
* Serialize with respect to concurrent write
* lock operation.
*/
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->writer) == 0)
break;
ck_pr_dec_uint(&rw->n_readers);
@@ -180,7 +181,7 @@ ck_rwlock_recursive_write_lock(ck_rwlock_recursive_t *rw, unsigned int tid)
while (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false)
ck_pr_stall();
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
while (ck_pr_load_uint(&rw->rw.n_readers) != 0)
ck_pr_stall();
@@ -202,7 +203,7 @@ ck_rwlock_recursive_write_trylock(ck_rwlock_recursive_t *rw, unsigned int tid)
if (ck_pr_cas_uint(&rw->rw.writer, 0, tid) == false)
return false;
ck_pr_fence_memory();
ck_pr_fence_atomic_load();
if (ck_pr_load_uint(&rw->rw.n_readers) != 0) {
ck_pr_store_uint(&rw->rw.writer, 0);

@@ -142,7 +142,7 @@ ck_spinlock_anderson_lock(struct ck_spinlock_anderson *lock,
/* Prepare slot for potential re-use by another thread. */
ck_pr_store_uint(&lock->slots[position].locked, true);
ck_pr_fence_store();
ck_pr_fence_memory();
*slot = lock->slots + position;
return;
@@ -194,7 +194,7 @@ ck_spinlock_fas_trylock(struct ck_spinlock_fas *lock)
if (value == false)
ck_pr_fence_memory();
return (!value);
return !value;
}
CK_CC_INLINE static bool
@@ -268,7 +268,7 @@ ck_spinlock_cas_trylock(struct ck_spinlock_cas *lock)
if (value == false)
ck_pr_fence_memory();
return (!value);
return !value;
}
CK_CC_INLINE static bool
@@ -408,9 +408,9 @@ ck_spinlock_dec_unlock(struct ck_spinlock_dec *lock)
/*
* If 16-bit or 32-bit increment is supported, implement support for
* trylock functionality on availability of 32-bit or 64-bit fetch-and-add
* and compare-and-swap.
* and compare-and-swap. This code path is only applied to x86*.
*/
#if defined(CK_MD_TSO)
#if defined(CK_MD_TSO) && (defined(__x86__) || defined(__x86_64__))
#if defined(CK_F_PR_FAA_32) && defined(CK_F_PR_INC_16) && defined(CK_F_PR_CAS_32)
#define CK_SPINLOCK_TICKET_TYPE uint32_t
#define CK_SPINLOCK_TICKET_TYPE_BASE uint16_t
@@ -658,9 +658,9 @@ CK_CC_INLINE static bool
ck_spinlock_mcs_trylock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *node)
{
ck_pr_store_uint(&node->locked, true);
ck_pr_store_ptr(&node->next, NULL);
ck_pr_fence_store();
node->locked = true;
node->next = NULL;
ck_pr_fence_store_atomic();
if (ck_pr_cas_ptr(queue, NULL, node) == true) {
ck_pr_fence_load();
@@ -686,24 +686,24 @@ ck_spinlock_mcs_lock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *nod
* In the case that there is a successor, let them know they must wait
* for us to unlock.
*/
ck_pr_store_uint(&node->locked, true);
ck_pr_store_ptr(&node->next, NULL);
node->locked = true;
node->next = NULL;
ck_pr_fence_store_atomic();
/*
* Swap current tail with current lock request. If the swap operation
* returns NULL, it means the queue was empty. If the queue was empty,
* then the operation is complete.
*/
ck_pr_fence_memory();
previous = ck_pr_fas_ptr(queue, node);
if (previous == NULL)
return;
/* Let the previous lock holder know that we are waiting on them. */
ck_pr_store_ptr(&previous->next, node);
while (ck_pr_load_uint(&node->locked) == true)
ck_pr_stall();
if (previous != NULL) {
/* Let the previous lock holder know that we are waiting on them. */
ck_pr_store_ptr(&previous->next, node);
while (ck_pr_load_uint(&node->locked) == true)
ck_pr_stall();
}
ck_pr_fence_load();
return;
}
@@ -712,6 +712,8 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n
{
struct ck_spinlock_mcs *next;
ck_pr_fence_memory();
next = ck_pr_load_ptr(&node->next);
if (next == NULL) {
/*
@@ -721,7 +723,6 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n
*/
if (ck_pr_load_ptr(queue) == node &&
ck_pr_cas_ptr(queue, node, NULL) == true) {
ck_pr_fence_memory();
return;
}
@@ -740,9 +741,7 @@ ck_spinlock_mcs_unlock(struct ck_spinlock_mcs **queue, struct ck_spinlock_mcs *n
}
/* Allow the next lock operation to complete. */
ck_pr_fence_memory();
ck_pr_store_uint(&next->locked, false);
return;
}
#endif /* CK_F_SPINLOCK_MCS */
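For reference, the reworked routines above are consumed as follows; each thread passes its own queue node, and the plain stores to node->locked and node->next are safe because ck_pr_fence_store_atomic orders them before the fetch-and-swap publishes the node. A minimal usage sketch with a hypothetical counter:

#include <ck_spinlock.h>

static struct ck_spinlock_mcs *queue = NULL;
static unsigned long counter;

void
counter_increment(void)
{
	struct ck_spinlock_mcs node;

	/* The node must remain live until the matching unlock. */
	ck_spinlock_mcs_lock(&queue, &node);
	counter++;
	ck_spinlock_mcs_unlock(&queue, &node);
	return;
}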

@@ -31,9 +31,21 @@
#error Do not include this file directly, use ck_pr.h
#endif
#include <ck_cc.h>
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#ifndef CK_F_PR
#define CK_F_PR
#include <stdbool.h>
#include <ck_stdint.h>
#include <ck_cc.h>
/*
* The following represent supported atomic operations.
@@ -93,45 +105,32 @@ ck_pr_stall(void)
return;
}
/*
* Most target architectures do not require this.
*/
CK_CC_INLINE static void
ck_pr_fence_load_depends(void)
{
__sync_synchronize();
return;
}
/*
* Load and store fences are equivalent to full fences in the GCC port.
*/
#define CK_PR_FENCE(T) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__sync_synchronize(); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__sync_synchronize(); \
}
CK_PR_FENCE(atomic)
CK_PR_FENCE(atomic_atomic)
CK_PR_FENCE(atomic_load)
CK_PR_FENCE(atomic_store)
CK_PR_FENCE(store_atomic)
CK_PR_FENCE(load_atomic)
CK_PR_FENCE(load)
CK_PR_FENCE(load_load)
CK_PR_FENCE(load_store)
CK_PR_FENCE(store)
CK_PR_FENCE(store_store)
CK_PR_FENCE(store_load)
CK_PR_FENCE(memory)
#undef CK_PR_FENCE
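Every fence in this generic port, strict or not, therefore expands to a full barrier; relaxation per memory model is left to the architecture-specific ports. For instance, CK_PR_FENCE(load) above is equivalent to writing:

CK_CC_INLINE static void
ck_pr_fence_strict_load(void)
{

	__sync_synchronize();
}

CK_CC_INLINE static void
ck_pr_fence_load(void)
{

	__sync_synchronize();
}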
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
/*
* Atomic compare and swap.
*/
@@ -275,5 +274,5 @@ CK_PR_UNARY_S(8, uint8_t)
#undef CK_PR_UNARY_S
#undef CK_PR_UNARY
#endif /* !CK_F_PR */
#endif /* _CK_PR_GCC_H */

@@ -41,6 +41,11 @@
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* This bounces the hardware thread from low to medium
* priority. I am unsure of the benefits of this approach
@@ -55,45 +60,26 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(load_depends, "")
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
CK_PR_FENCE(atomic, "lwsync")
CK_PR_FENCE(atomic_store, "lwsync")
CK_PR_FENCE(atomic_load, "sync")
CK_PR_FENCE(store_atomic, "lwsync")
CK_PR_FENCE(load_atomic, "lwsync")
CK_PR_FENCE(store, "lwsync")
CK_PR_FENCE(store_load, "sync")
CK_PR_FENCE(load, "lwsync")
CK_PR_FENCE(load_store, "lwsync")
CK_PR_FENCE(memory, "sync")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_load_##S(const M *target) \

@@ -40,6 +40,11 @@
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* This bounces the hardware thread from low to medium
* priority. I am unsure of the benefits of this approach
@@ -54,49 +59,30 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
/*
* These are derived from:
* http://www.ibm.com/developerworks/systems/articles/powerpc.html
*/
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(atomic, "lwsync")
CK_PR_FENCE(atomic_store, "lwsync")
CK_PR_FENCE(atomic_load, "sync")
CK_PR_FENCE(store_atomic, "lwsync")
CK_PR_FENCE(load_atomic, "lwsync")
CK_PR_FENCE(store, "lwsync")
CK_PR_FENCE(store_load, "sync")
CK_PR_FENCE(load, "lwsync")
CK_PR_FENCE(load_store, "lwsync")
CK_PR_FENCE(memory, "sync")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_load_##S(const M *target) \

@@ -40,6 +40,11 @@
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* Order loads at the least.
*/
@@ -51,51 +56,30 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
/*
* If RMO is forced, then do not assume TSO model.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
/*
* By default, we will assume TSO model is used on SPARCv9.
* Atomic operations are treated as both load and store
* operations on SPARCv9.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(atomic, "membar #StoreStore")
CK_PR_FENCE(atomic_store, "membar #StoreStore")
CK_PR_FENCE(atomic_load, "membar #StoreLoad")
CK_PR_FENCE(store_atomic, "membar #StoreStore")
CK_PR_FENCE(load_atomic, "membar #LoadStore")
CK_PR_FENCE(store, "membar #StoreStore")
CK_PR_FENCE(store_load, "membar #StoreLoad")
CK_PR_FENCE(load, "membar #LoadLoad")
CK_PR_FENCE(load_store, "membar #LoadStore")
CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_load_##S(const M *target) \

@@ -63,52 +63,26 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
/*
* IA32 has strong memory ordering guarantees, so memory
* fences are enabled if and only if the user specifies that
* the program will be using non-temporal instructions.
* Otherwise, an optimization barrier is used in order to prevent
* compiler re-ordering of loads and stores across the barrier.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(atomic, "sfence")
CK_PR_FENCE(atomic_store, "sfence")
CK_PR_FENCE(atomic_load, "mfence")
CK_PR_FENCE(store_atomic, "sfence")
CK_PR_FENCE(load_atomic, "mfence")
CK_PR_FENCE(load, "lfence")
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(load_store, "mfence")
CK_PR_FENCE(store, "sfence")
CK_PR_FENCE(store_load, "mfence")
CK_PR_FENCE(memory, "mfence")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
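As the comment above notes, the strict variants matter chiefly for weakly-ordered (non-temporal) stores. A hedged sketch of a producer that must fence such a store before publishing a flag; _mm_stream_si32 is the SSE2 non-temporal store intrinsic, and the buffer and flag names are hypothetical:

#include <ck_pr.h>
#include <emmintrin.h>

static int buffer[16];
static unsigned int ready;

void
produce(void)
{

	/* Weakly-ordered store that bypasses the cache. */
	_mm_stream_si32(&buffer[0], 42);

	/* sfence makes the non-temporal store visible first. */
	ck_pr_fence_strict_store();

	ck_pr_store_uint(&ready, 1);
	return;
}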
/*
* Atomic fetch-and-store operations.
*/

@@ -62,52 +62,25 @@ ck_pr_stall(void)
return;
}
#if defined(CK_MD_RMO) || defined(CK_MD_PSO)
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
}
#else
/*
* IA32 has strong memory ordering guarantees, so memory
* fences are enabled if and only if the user specifies that
* the program will be using non-temporal instructions.
* Otherwise, an optimization barrier is used in order to prevent
* compiler re-ordering of loads and stores across the barrier.
*/
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
} \
CK_CC_INLINE static void ck_pr_fence_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
}
#endif /* !CK_MD_RMO && !CK_MD_PSO */
CK_PR_FENCE(atomic_store, "sfence")
CK_PR_FENCE(atomic_load, "mfence")
CK_PR_FENCE(store_atomic, "sfence")
CK_PR_FENCE(load_atomic, "mfence")
CK_PR_FENCE(load, "lfence")
CK_PR_FENCE(load_depends, "")
CK_PR_FENCE(load_store, "mfence")
CK_PR_FENCE(store, "sfence")
CK_PR_FENCE(store_load, "mfence")
CK_PR_FENCE(memory, "mfence")
#undef CK_PR_FENCE
CK_CC_INLINE static void
ck_pr_barrier(void)
{
__asm__ __volatile__("" ::: "memory");
return;
}
/*
* Atomic fetch-and-store operations.
*/

@@ -3,16 +3,16 @@
all: ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2
ck_pr_cas_64_2: ck_pr_cas_64_2.c
$(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c
$(CC) $(CFLAGS) -o ck_pr_cas_64_2 ck_pr_cas_64_2.c -lm
ck_pr_cas_64: ck_pr_cas_64.c
$(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c
$(CC) $(CFLAGS) -o ck_pr_cas_64 ck_pr_cas_64.c -lm
ck_pr_fas_64: ck_pr_fas_64.c
$(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c
$(CC) $(CFLAGS) -o ck_pr_fas_64 ck_pr_fas_64.c -lm
clean:
rm -rf ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 *.dSYM *.exe
include ../../../build/regressions.build
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE

@@ -14,67 +14,67 @@ OBJECTS=ck_ticket.THROUGHPUT ck_ticket.LATENCY \
all: $(OBJECTS)
ck_spinlock.THROUGHPUT: ck_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_spinlock.THROUGHPUT ck_spinlock.c -lm
ck_spinlock.LATENCY: ck_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o ck_spinlock.LATENCY ck_spinlock.c -lm
ck_ticket.THROUGHPUT: ck_ticket.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket.THROUGHPUT ck_ticket.c -lm
ck_ticket.LATENCY: ck_ticket.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket.LATENCY ck_ticket.c -lm
ck_mcs.THROUGHPUT: ck_mcs.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_mcs.THROUGHPUT ck_mcs.c -lm
ck_mcs.LATENCY: ck_mcs.c
$(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c
$(CC) -DLATENCY $(CFLAGS) -o ck_mcs.LATENCY ck_mcs.c -lm
ck_dec.THROUGHPUT: ck_dec.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_dec.THROUGHPUT ck_dec.c -lm
ck_dec.LATENCY: ck_dec.c
$(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c
$(CC) -DLATENCY $(CFLAGS) -o ck_dec.LATENCY ck_dec.c -lm
ck_cas.THROUGHPUT: ck_cas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_cas.THROUGHPUT ck_cas.c -lm
ck_cas.LATENCY: ck_cas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_cas.LATENCY ck_cas.c -lm
ck_fas.THROUGHPUT: ck_fas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_fas.THROUGHPUT ck_fas.c -lm
ck_fas.LATENCY: ck_fas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c
$(CC) -DLATENCY $(CFLAGS) -o ck_fas.LATENCY ck_fas.c -lm
ck_clh.THROUGHPUT: ck_clh.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_clh.THROUGHPUT ck_clh.c -lm
ck_clh.LATENCY: ck_clh.c
$(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c
$(CC) -DLATENCY $(CFLAGS) -o ck_clh.LATENCY ck_clh.c -lm
linux_spinlock.THROUGHPUT: linux_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o linux_spinlock.THROUGHPUT linux_spinlock.c -lm
linux_spinlock.LATENCY: linux_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c
$(CC) -DLATENCY $(CFLAGS) -o linux_spinlock.LATENCY linux_spinlock.c -lm
ck_ticket_pb.THROUGHPUT: ck_ticket_pb.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_ticket_pb.THROUGHPUT ck_ticket_pb.c -lm
ck_ticket_pb.LATENCY: ck_ticket_pb.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c
$(CC) -DLATENCY $(CFLAGS) -o ck_ticket_pb.LATENCY ck_ticket_pb.c -lm
ck_anderson.THROUGHPUT: ck_anderson.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c
$(CC) -DTHROUGHPUT $(CFLAGS) -o ck_anderson.THROUGHPUT ck_anderson.c -lm
ck_anderson.LATENCY: ck_anderson.c
$(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c
$(CC) -DLATENCY $(CFLAGS) -o ck_anderson.LATENCY ck_anderson.c -lm
clean:
rm -rf *.dSYM *.exe $(OBJECTS)
include ../../../build/regressions.build
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE -lm
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE

@@ -162,6 +162,7 @@ ck_epoch_recycle(struct ck_epoch *global)
record = ck_epoch_record_container(cursor);
if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) {
/* Serialize with respect to deferral list clean-up. */
ck_pr_fence_load();
state = ck_pr_fas_uint(&record->state, CK_EPOCH_STATE_USED);
if (state == CK_EPOCH_STATE_FREE) {

@@ -421,7 +421,7 @@ restart:
*/
if (slot != NULL && *slot != CK_HS_EMPTY) {
ck_pr_inc_uint(&map->generation[h & CK_HS_G_MASK]);
ck_pr_fence_store();
ck_pr_fence_atomic_store();
ck_pr_store_ptr(slot, CK_HS_TOMBSTONE);
}
} else {
