/*
 * Micro-benchmark comparing the unaligned-load instruction (movdqu) with the
 * aligned-load instruction (movdqa) on a buffer that is in fact aligned.
 *
 * The C source always uses _mm_loadu_si128; the aligned variant is produced
 * by rewriting the generated assembly.
 *
 * Test with something like:
 *
 *   gcc-4.9.0 -std=c11 -O3 test.c -S -o test_u.s
 *   sed 's/movdqu/movdqa/' test_u.s > test_a.s
 *   gcc-4.9.0 test_u.s -o test_u
 *   gcc-4.9.0 test_a.s -o test_a
 *
 *   perf stat -r4 ./test_u
 *     1013.003257 task-clock   #  0.997 CPUs utilized   ( +- 0.01% )
 *     2,188,597,489 cycles     #  2.161 GHz             ( +- 0.01% )
 *
 *   perf stat -r4 ./test_a
 *     513.658041 task-clock    #  0.997 CPUs utilized   ( +- 0.05% )
 *     1,109,852,134 cycles     #  2.161 GHz             ( +- 0.05% )
 *
 * (on a Pentium Dual-Core T3400)
 */

/* Must precede every #include so that <stdlib.h> declares posix_memalign()
 * even under -std=c11. */
#define _GNU_SOURCE

#include <emmintrin.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of 16-byte loads issued per iteration of the outer loop in f(). */
#define UNROLL 4

/*
 * Sum a buffer as packed 32-bit integers.
 *
 * p   start of the buffer; read with unaligned 16-byte loads.
 * sz  buffer size in bytes.  Only whole groups of 16 * UNROLL bytes are
 *     read; any remaining tail bytes are ignored.
 *
 * Returns the sum of all 32-bit lanes read, modulo 2^32.
 *
 * noinline keeps the function as a separate symbol so its loop appears as-is
 * in the generated assembly that the test procedure edits.
 */
static uint32_t __attribute__((noinline))
f(__m128i *p, size_t sz)
{
	__m128i t = _mm_setzero_si128();

	for (size_t i = 0; i < sz / 16 / UNROLL; ++i) {
		for (size_t j = 0; j < UNROLL; ++j) {
			t = _mm_add_epi32(t, _mm_loadu_si128(p++));
		}
	}

	/* Spill the four lane accumulators and add them horizontally. */
	uint32_t temp[4];
	_mm_storeu_si128((__m128i *)temp, t);
	return temp[0] + temp[1] + temp[2] + temp[3];
}

/*
 * Run f() 1024*1024 times over a zero-filled 16 KiB buffer and print the
 * accumulated result.
 *
 * The buffer is 4096-byte aligned; movdqa faults on addresses that are not
 * 16-byte aligned, so this alignment is what makes the movdqu -> movdqa
 * substitution described above safe to run.
 */
int main(void)
{
	void *p = NULL;
	size_t sz = 1024 * 16;

	if (posix_memalign(&p, 4096, sz)) {
		abort();
	}
	memset(p, 0, sz);

	uint32_t n = 0;
	for (int i = 0; i < 1024 * 1024; ++i) {
		n += f(p, sz);
	}

	/* n is unsigned 32-bit, so use the matching conversion specifier. */
	printf("%" PRIu32 "\n", n);

	free(p);
	return 0;
}