- memmove()/bzero() measurements show how well the OS's C library has optimized its memmove()/bzero() routines for each CPU architecture.
- rep movsb/stosb measurements show how well the ERMS (Enhanced REP MOVSB/STOSB) feature is implemented on a given CPU. Intel only. (A rough rep stosb sketch follows this list.)
- temporal load/store measurements show how fast memory reads/writes run when they go through the cache hierarchy.
- non-temporal (aka streaming) load/store measurements show how fast memory reads/writes run when they bypass the cache.
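The rep movsb/stosb kernels themselves are not shown below, so purely as an illustration, an ERMS-style zero fill could look like the sketch here. The function name and constraints are mine, not from the original benchmark, and real code should first confirm ERMS support via the CPUID feature bit (leaf 7, EBX bit 9).

Code:

// ERMS-style block zero fill via rep stosb (illustrative sketch; bs > 0 assumed)
void *ebzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
    void *p = dst;
    __asm__ __volatile__ (
        "xorl %%eax, %%eax\n"   // AL = 0, the byte pattern to store
        "rep stosb\n"           // store AL to [RDI], RCX times; CPU updates RDI/RCX
        : "+D"(p) /* %0: RDI */, "+c"(bs) /* %1: RCX */
        : /* input */
        : "eax", "cc", "memory" /* clobbered */
    );
    return dst;
}

A rep movsb copy kernel would look the same with the source pointer tied to RSI ("+S") and movsb in place of stosb.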
pre-SSE4.1:
Code:
// temporal block read
void *tbread(void *dst, const void *src, size_t bs)
{
__m128 x0, x1, x2, x3;
__asm__ __volatile__ (
"BREAD1_%=:\n"
// temporal load. cache pollution.
"movaps (%0), %2\n"
"movaps 0x10(%0), %3\n"
"movaps 0x20(%0), %4\n"
"movaps 0x30(%0), %5\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(x0) /* %2 */, "=x"(x1) /* %3 */, "=x"(x2) /* %4 */, "=x"(x3) /* %5 */ /* output */
: "0"(src) /* %6 */, "1"(bs) /* %7 */ /* input */
: "memory" /* clobbered: the asm reads the buffer behind the compiler's back */
);
return dst;
}
// temporal block zero fill
void *tbzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
void *p = dst;
__m128 x0;
__asm__ __volatile__ (
"xorps %2, %2\n"
"BZERO1_%=:\n"
// temporal store. cache pollution.
"movaps %2, (%0)\n"
"movaps %2, 0x10(%0)\n"
"movaps %2, 0x20(%0)\n"
"movaps %2, 0x30(%0)\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(x0) /* %2 */ /* output */
: "0"(p) /* %3 */, "1"(bs) /* %4 */ /* input */
: "memory" /* clobbered */
);
return dst;
}
// non-temporal block read
void *ntbread(void *dst, const void *src, size_t bs)
{
__m128 x0, x1, x2, x3;
__asm__ __volatile__ (
"BREAD1_%=:\n"
// non-temporal loads (movntdqa) require SSE4.1, so this falls back to movaps.
// NOTE: temporal load. cache pollution.
"movaps (%0), %2\n"
"movaps 0x10(%0), %3\n"
"movaps 0x20(%0), %4\n"
"movaps 0x30(%0), %5\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(x0) /* %2 */, "=x"(x1) /* %3 */, "=x"(x2) /* %4 */, "=x"(x3) /* %5 */ /* output */
: "0"(src) /* %6 */, "1"(bs) /* %7 */ /* input */
: "memory" /* clobbered: the asm reads the buffer behind the compiler's back */
);
return dst;
}
// non-temporal block zero fill
void *ntbzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
void *p = dst;
__m128 x0;
__asm__ __volatile__ (
"xorps %2, %2\n"
"BZERO1_%=:\n"
// movntdq (SSE2): Non-temporal store of double quadword from an XMM register into memory
// non-temporal store. no cache pollution.
"movntdq %2, (%0)\n"
"movntdq %2, 0x10(%0)\n"
"movntdq %2, 0x20(%0)\n"
"movntdq %2, 0x30(%0)\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(x0) /* %2 */ /* output */
: "0"(p) /* %3 */, "1"(bs) /* %4 */ /* input */
: "memory" /* clobbered */
);
return dst;
}
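A caveat that applies to the movntdq stores above (and to the SSE4.1/AVX2 store variants further down): non-temporal stores are weakly ordered, so benchmark code normally drains them with an sfence after the loop, before the stop timestamp is taken or the data is read back. A minimal helper for that could be something like this (my addition, not part of the original routines):

Code:

// drain weakly-ordered non-temporal (streaming) stores
static inline void nt_store_fence(void)
{
    __asm__ __volatile__ ("sfence" ::: "memory");
}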
SSE4.1:
Code:
// non-temporal block read (non-temporal load is available on SSE4.1 and up)
void *ntbread_sse41(void *dst, const void *src, size_t bs)
{
//
// movntdqa (SSE4.1): Move double quadword from m128 to xmm1 using non-temporal hint if WC memory type.
//
// MOVNTDQA provides a non-temporal hint that can cause adjacent 16-byte items
// within an aligned 64-byte region (a streaming line) to be fetched and held in
// a small set of temporary buffers ("streaming load buffers"). Subsequent
// streaming loads to other aligned 16-byte items in the same streaming line may
// be supplied from the streaming load buffer and can improve throughput.
//
__m128 x0, x1, x2, x3;
__asm__ __volatile__ (
"BREAD1_%=:\n"
// non-temporal load. no cache pollution.
"movntdqa (%0), %2\n"
"movntdqa 0x10(%0), %3\n"
"movntdqa 0x20(%0), %4\n"
"movntdqa 0x30(%0), %5\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(x0) /* %2 */, "=x"(x1) /* %3 */, "=x"(x2) /* %4 */, "=x"(x3) /* %5 */ /* output */
: "0"(src) /* %6 */, "1"(bs) /* %7 */ /* input */
: "memory" /* clobbered: the asm reads the buffer behind the compiler's back */
);
return dst;
}
// non-temporal block zero fill
void *ntbzero_sse41(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
//
// movntdq (SSE2): Move packed integer values in xmm1 to m128 using non-temporal hint.
//
void *p = dst;
__m128 x0;
__asm__ __volatile__ (
"xorps %2, %2\n"
"BZERO1_%=:\n"
// non-temporal store. no cache pollution.
"movntdq %2, (%0)\n"
"movntdq %2, 0x10(%0)\n"
"movntdq %2, 0x20(%0)\n"
"movntdq %2, 0x30(%0)\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(x0) /* %2 */ /* output */
: "0"(p) /* %3 */, "1"(bs) /* %4 */ /* input */
: "memory" /* clobbered */
);
return dst;
}
AVX2:
Code:
// temporal block read
void *tbread_avx2(void *dst, const void *src, size_t bs)
{
__m256 y0, y1;
__asm__ __volatile__ (
"BREAD1_%=:\n"
// temporal load. cache pollution.
"vmovaps (%0), %2\n"
"vmovaps 0x20(%0), %3\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(y1) /* %3 */ /* output */
: "0"(src) /* %4 */, "1"(bs) /* %5 */ /* input */
: "memory" /* clobbered: the asm reads the buffer behind the compiler's back */
);
return dst;
}
// temporal block zero fill
void *tbzero_avx2(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
void *p = dst;
__m128 x0;
__m256 y0;
__asm__ __volatile__ (
//
// vmovaps (AVX): Move aligned packed single-precision floating-point values from ymm1 to ymm2/m256 (store form used here).
//
"xorps %3, %3\n"
"vbroadcastss %3, %2\n"
"BZERO1_%=:\n"
// temporal store. cache pollution.
"vmovaps %2, (%0)\n"
"vmovaps %2, 0x20(%0)\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(x0) /* %3 */ /* output */
: "0"(p) /* %4 */, "1"(bs) /* %5 */ /* input */
: "memory" /* clobbered */
);
return dst;
}
// non-temporal block read (non-temporal 256-bit load is available on AVX2 and up)
void *ntbread_avx2(void *dst, const void *src, size_t bs)
{
__m256 y0, y1;
__asm__ __volatile__ (
"BREAD1_%=:\n"
// non-temporal load. no cache pollution.
"vmovntdqa (%0), %2\n"
"vmovntdqa 0x20(%0), %3\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(y1) /* %3 */ /* output */
: "0"(src) /* %4 */, "1"(bs) /* %5 */ /* input */
: "memory" /* clobbered: the asm reads the buffer behind the compiler's back */
);
return dst;
}
// non-temporal block zero fill
void *ntbzero_avx2(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
void *p = dst;
__m128 x0;
__m256 y0;
__asm__ __volatile__ (
//
// vmovntdq (AVX): Move packed integer values in ymm1 to m256 using non-temporal hint.
//
"xorps %3, %3\n"
"vbroadcastss %3, %2\n"
"BZERO1_%=:\n"
// non-temporal store. no cache pollution.
"vmovntdq %2, (%0)\n"
"vmovntdq %2, 0x20(%0)\n"
"addq $0x40, %0\n"
"subq $0x40, %1\n"
"ja BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(x0) /* %3 */ /* output */
: "0"(p) /* %4 */, "1"(bs) /* %5 */ /* input */
: "memory" /* clobbered */
);
return dst;
}
ARM64 (AArch64):
Code:
// temporal block read
void *tbread(void *dst, const void *src, size_t bs)
{
__asm__ __volatile__ (
"BREAD1_%=:\n"
"ldp q0, q1, [%0]\n"
"ldp q2, q3, [%0, #0x20]\n"
"add %0, %0, #0x40\n"
"subs %1, %1, #0x40\n"
"b.hi BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */ /* output */
: "0"(src) /* %0 */, "1"(bs) /* %1 */ /* input */
: "v0", "v1", "v2", "v3" /* clobbered */
);
return dst;
}
// non-temporal block read
void *ntbread(void *dst, const void *src, size_t bs)
{
__asm__ __volatile__ (
"BREAD1_%=:\n"
"ldnp q0, q1, [%0]\n"
"ldnp q2, q3, [%0, #0x20]\n"
"add %0, %0, #0x40\n"
"subs %1, %1, #0x40\n"
"b.hi BREAD1_%=\n"
: "=r"(src) /* %0 */, "=r"(bs) /* %1 */ /* output */
: "0"(src) /* %0 */, "1"(bs) /* %1 */ /* input */
: "v0", "v1", "v2", "v3" /* clobbered */
);
return dst;
}
// temporal block zero fill
void *tbzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
void *p = dst;
__asm__ __volatile__ (
// STP: Store Pair of Registers.
"eor.16b v0, v0, v0\n"
"BZERO1_%=:\n"
// temporal store. cache pollution.
"stp q0, q0, [%0]\n"
"stp q0, q0, [%0, #0x20]\n"
"add %0, %0, #0x40\n"
"subs %1, %1, #0x40\n"
"b.hi BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */ /* output */
: "0"(p) /* %0 */, "1"(bs) /* %1 */ /* input */
: "v0", "memory" /* clobbered */
);
return dst;
}
// non-temporal block zero fill
void *ntbzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
void *p = dst;
__asm__ __volatile__ (
// STNP: Store Pair of Registers, with non-temporal hint.
"eor.16b v0, v0, v0\n"
"BZERO1_%=:\n"
// non-temporal store. no cache pollution.
"stnp q0, q0, [%0]\n"
"stnp q0, q0, [%0, #0x20]\n"
"add %0, %0, #0x40\n"
"subs %1, %1, #0x40\n"
"b.hi BZERO1_%=\n"
: "=r"(p) /* %0 */, "=r"(bs) /* %1 */ /* output */
: "0"(p) /* %0 */, "1"(bs) /* %1 */ /* input */
: "v0", "memory" /* clobbered */
);
return dst;
}
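For context, here is a minimal sketch of how one of these block routines might be driven and timed. The buffer size, iteration count, and the clock_gettime()-based timer are illustrative choices of mine, not taken from the original benchmark; it assumes the routines above live in the same translation unit.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void *tbzero(void *dst, const void *src, size_t bs);  // defined above

int main(void)
{
    const size_t bs = (size_t)64 << 20;     // 64 MiB block, a multiple of 64 bytes
    const int iters = 100;
    void *buf = NULL;
    if (posix_memalign(&buf, 64, bs) != 0)  // 64-byte alignment for the aligned vector ops
        return 1;

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tbzero(buf, NULL, bs);              // or ntbzero/tbread/ntbread
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (double)(t1.tv_sec - t0.tv_sec) + (double)(t1.tv_nsec - t0.tv_nsec) * 1e-9;
    printf("%.2f GB/s\n", (double)bs * iters / secs / 1e9);

    free(buf);
    return 0;
}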