In C/C++, <stdatomic.h> release-acquire semantics are essentially free on x86 thanks to its memory model. The point of using atomic_load_explicit(&i, memory_order_acquire) and atomic_store_explicit(&i, 0, memory_order_release) is that no memory barrier instruction is needed to get correct release-acquire ordering on x86.
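To make that concrete, here is a minimal C11 sketch (the variable and function names are illustrative only); on x86, both operations are expected to compile down to plain mov instructions:

#include <stdatomic.h>

atomic_int ready;

int consume(void) {
    // Acquire load: on x86 this is just a mov; the hardware memory model (TSO)
    // already provides acquire ordering, so no barrier instruction is emitted.
    return atomic_load_explicit(&ready, memory_order_acquire);
}

void publish(void) {
    // Release store: likewise a plain mov on x86; memory_order_release
    // only restricts compiler reordering here.
    atomic_store_explicit(&ready, 0, memory_order_release);
}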
In Java there are also VarHandle::getAcquire and VarHandle::setRelease, which, roughly speaking, provide the same memory semantics. The problem is that when I tried to measure VarHandle::getAcquire, I found that it introduces a lot of overhead, which essentially defeats the whole point of the optimization. Here is the benchmark:
package com.test;

import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.util.concurrent.TimeUnit;

import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(value = 1)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class Benchmarks {

    private int variable = 0;

    private static VarHandle VAR_HANDLE;

    static {
        try {
            MethodHandles.Lookup l = MethodHandles.lookup();
            VAR_HANDLE = l.findVarHandle(Benchmarks.class, "variable", int.class);
        } catch (ReflectiveOperationException e) {
            throw new Error(e);
        }
    }

    @Benchmark
    @BenchmarkMode(Mode.AverageTime)
    public void readAcquire(Blackhole bh) {
        bh.consume(VAR_HANDLE.getAcquire(this));
    }
}
-prof perfasm shows quite a lot of noise, for example:
mov 0x20(%r12,%r8,8),%r9d ;*aaload {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.VarHandle::getMethodHandle@10 (line 1979)
; - java.lang.invoke.VarHandleGuards::guard_L_L@50 (line 40)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
mov 0x10(%r12,%r9,8),%ecx ;*getfield type {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.MethodHandle::asType@2 (line 839)
; - java.lang.invoke.VarHandleGuards::guard_L_L@59 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
; implicit exception: dispatches to 0x00007f50282ef40c
cmp $0xe2d7411d,%ecx ; {oop(a 'java/lang/invoke/MethodType'{0x0000000716ba08e8} = (Ljava/lang/invoke/VarHandle;Lcom/test/Benchmarks;)Ljava/lang/Object;)}
je 0x7f50282ef318 ;*if_acmpne {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.MethodHandle::asType@5 (line 839)
; - java.lang.invoke.VarHandleGuards::guard_L_L@59 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
mov 0x18(%r12,%r9,8),%r10d ;*getfield asTypeCache {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.MethodHandle::asTypeCached@1 (line 851)
; - java.lang.invoke.MethodHandle::asType@12 (line 843)
; - java.lang.invoke.VarHandleGuards::guard_L_L@59 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
mov 0x10(%r12,%r10,8),%r8d ;*getfield type {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.MethodHandle::asTypeCached@11 (line 852)
; - java.lang.invoke.MethodHandle::asType@12 (line 843)
; - java.lang.invoke.VarHandleGuards::guard_L_L@59 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
; implicit exception: dispatches to 0x00007f50282ef428
cmp $0xe2d7411d,%r8d ; {oop(a 'java/lang/invoke/MethodType'{0x0000000716ba08e8} = (Ljava/lang/invoke/VarHandle;Lcom/test/Benchmarks;)Ljava/lang/Object;)}
jne 0x7f50282ef2e4 ;*if_acmpne {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.MethodHandle::asTypeCached@14 (line 852)
; - java.lang.invoke.MethodHandle::asType@12 (line 843)
; - java.lang.invoke.VarHandleGuards::guard_L_L@59 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
lea (%r12,%r11,8),%rdx ;*getstatic VAR_HANDLE {reexecute=0 rethrow=0 return_oop=0}
; - com.test.Benchmarks::readAcquire@1 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
lea (%r12,%r10,8),%rsi ;*getfield asTypeCache {reexecute=0 rethrow=0 return_oop=0}
; - java.lang.invoke.MethodHandle::asTypeCached@1 (line 851)
; - java.lang.invoke.MethodHandle::asType@12 (line 843)
; - java.lang.invoke.VarHandleGuards::guard_L_L@59 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 33)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
or
callq 0x7fa21883fd80
;*invokevirtual invokeBasic {reexecute=0 rethrow=0 return_oop=1}
; - java.lang.invoke.VarHandleGuards::guard_L_L@64 (line 41)
; - com.test.Benchmarks::readAcquire@5 (line 32)
; - com.test.jmh_generated.Benchmarks_readAcquire_jmhTest::readAcquire_avgt_jmhStub@17 (line 190)
where VarHandleGuards::guard__L looks like this:
static final Object guard__L(VarHandle handle, VarHandle.AccessDescriptor ad) throws Throwable {
    handle.checkExactAccessMode(ad);
    if (handle.isDirect() && handle.vform.methodType_table[ad.type] == ad.symbolicMethodTypeErased) {
        // fast path: the handle is direct and the erased method types match exactly
        Object r = MethodHandle.linkToStatic(handle, handle.vform.getMemberName(ad.mode));
        return ad.returnType.cast(r);
    } else {
        // slow path: goes through getMethodHandle/asType/invokeBasic
        MethodHandle mh = handle.getMethodHandle(ad.mode);
        return mh.asType(ad.symbolicMethodTypeInvoker).invokeBasic(handle.asDirect());
    }
}
which appears to be called implicitly on every VarHandle invocation.
In Java, volatile provides sequential-consistency guarantees. Reading a volatile variable does not require an extra memory barrier on x86 (unlike on weaker ISAs), but writing to a volatile variable is typically compiled with an extra lock addl $0x0, (%rsp) inserted after the store instruction. (C++ compilers instead emit xchg for seq_cst stores rather than a mov followed by a separate mfence or locked instruction; that is slightly cheaper, but the full barrier is still the main cost.)
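For comparison, a minimal C11 sketch of a sequentially consistent store (names again illustrative); the exact instruction sequence is compiler-dependent, but some form of full barrier is required:

#include <stdatomic.h>

atomic_int flag;

void publish_seq_cst(void) {
    // A seq_cst store requires a full barrier on x86. HotSpot emits a plain
    // store followed by lock addl $0x0,(%rsp); C/C++ compilers typically fold
    // the store and the barrier into a single xchg instruction.
    atomic_store_explicit(&flag, 1, memory_order_seq_cst);
}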
Why does VarHandle::getAcquire introduce so much noise, and is it possible to use it instead of volatile when optimizing for latency?