/*
 * Micro-benchmark: times loops of packed single-precision multiply/add
 * instructions (legacy SSE encoding and VEX-encoded 128-bit forms) with the
 * x86 time-stamp counter, then prints flops/clock and instructions/clock.
 *
 * Requires an x86 target and a GCC-compatible compiler (inline assembly).
 *
 * NOTE(review): the original file listed seven #include directives whose
 * header names were lost. The three below are the ones this code needs
 * (PRIu64, uint32_t/uint64_t, printf); re-add any others if required.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#if !defined(__x86_64__) && !defined(__i386__)
#error "this benchmark uses x86 inline assembly (rdtsc, SSE, AVX)"
#endif

/*
 * Clock frequency in GHz used to convert cycle counts into seconds and
 * GFLOPS. NOTE(review): presumably the nominal frequency of the machine the
 * benchmark was written for -- adjust it for the machine being measured.
 */
#define GHz 3.40

/*
 * Read the 64-bit time-stamp counter.
 *
 * RDTSC returns the low half in EAX and the high half in EDX; the halves are
 * combined in C so the same code is correct for 32- and 64-bit builds.
 * (The previous 64-bit path loaded the constant 32 into RDX instead of
 * shifting by 32, which dropped the upper half of the counter.)
 */
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;

    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
}

/*
 * Print flops/clock, GFLOPS and elapsed seconds.
 *
 * flops:  number of floating-point operations executed.
 * cycles: elapsed time-stamp-counter ticks; converted to seconds with GHz.
 */
void print_GFLOPS(double flops, uint64_t cycles)
{
    double GFLOPS = flops * GHz / cycles;
    double sec = (double)cycles * 1e-9 / GHz;

    printf("GFLOPS @ %.2fGHz:\n %.3f [flops/clock] = %.3f [GFLOPS] (%.0f flops in %" PRIu64 " clock = %f sec)\n",
           GHz, flops / (double)cycles, GFLOPS, flops, cycles, sec);
}

/*
 * Print instructions/clock.
 *
 * instructions: number of instructions executed.
 * cycles:       elapsed time-stamp-counter ticks.
 */
void print_throughput(uint64_t instructions, uint64_t cycles)
{
    printf("Throughput:\n %.3f [instructions/clock] (%" PRIu64 " instrucions in %" PRIu64 " clock)\n",
           (double)instructions / (double)cycles, instructions, cycles);
}

/*
 * Zero xmm0..xmm7. The registers are listed as clobbers so the compiler does
 * not keep live values in them across this statement.
 * NOTE(review): the benchmarks below read these registers from separate asm
 * statements; the compiler does not promise to preserve register contents
 * between asm statements, so this is best-effort initialisation.
 */
#define zero_all_xmm()                                                  \
    do {                                                                \
        __asm__ __volatile__(                                           \
            "xorps %%xmm0, %%xmm0\n\t"                                  \
            "xorps %%xmm1, %%xmm1\n\t"                                  \
            "xorps %%xmm2, %%xmm2\n\t"                                  \
            "xorps %%xmm3, %%xmm3\n\t"                                  \
            "xorps %%xmm4, %%xmm4\n\t"                                  \
            "xorps %%xmm5, %%xmm5\n\t"                                  \
            "xorps %%xmm6, %%xmm6\n\t"                                  \
            "xorps %%xmm7, %%xmm7\n\t"                                  \
            :                                                           \
            :                                                           \
            : "xmm0", "xmm1", "xmm2", "xmm3",                           \
              "xmm4", "xmm5", "xmm6", "xmm7");                          \
    } while (0)

/* Number of iterations of each timed loop. */
#define LOOP (1 << 21)

/*
 * SSE benchmark in which each addps consumes the result of the mulps
 * directly before it (mulps from memory into xmm0/xmm1, addps into
 * xmm4/xmm5).
 */
void sse_mulps_addps_forwarding(void)
{
    const int flops_per_instruction = 4;    /* 4 packed floats per xmm op */
    const int instructions_per_loop = 4;    /* must match the asm body below */
    const double flops = flops_per_instruction * instructions_per_loop * LOOP;
    uint64_t clk0, clk1, cycles;
    float __attribute__ ((aligned(16))) a[4] = {0.0f, 0.0f, 0.0f, 0.0f};

    printf("-- sse_mulps_addps_forwarding --\n");
    zero_all_xmm();

    //IACA_START;
    clk0 = rdtsc();
    for (int i = 0; i < LOOP; ++i) {
        __asm__ __volatile__(
            "mulps %0, %%xmm0\n\t"
            "addps %%xmm0, %%xmm4\n\t"
            "mulps %0, %%xmm1\n\t"
            "addps %%xmm1, %%xmm5\n\t"
            :
            /* mulps reads 16 bytes, so describe the whole array as input */
            : "m"(*(const float (*)[4])a)
            : "xmm0", "xmm1", "xmm4", "xmm5");
    }
    clk1 = rdtsc();
    //IACA_END;

    cycles = clk1 - clk0;
    print_GFLOPS(flops, cycles);
    print_throughput(instructions_per_loop * LOOP, cycles);
}

/*
 * SSE benchmark with four register-to-register instructions per iteration,
 * each using its own source/destination pair (xmm0/1, xmm2/3, xmm4/5,
 * xmm6/7), so no instruction reads another instruction's destination.
 */
void sse_mulps_addps_no_dependency(void)
{
    const int flops_per_instruction = 4;    /* 4 packed floats per xmm op */
    const int instructions_per_loop = 4;    /* must match the asm body below */
    const double flops = flops_per_instruction * instructions_per_loop * LOOP;
    uint64_t clk0, clk1, cycles;

    printf("-- sse_mulps_addps_no_dependency --\n");
    zero_all_xmm();

    //IACA_START;
    clk0 = rdtsc();
    for (int i = 0; i < LOOP; ++i) {
        __asm__ __volatile__(
            "mulps %%xmm0, %%xmm1\n\t"
            "addps %%xmm2, %%xmm3\n\t"
            "mulps %%xmm4, %%xmm5\n\t"
            "addps %%xmm6, %%xmm7\n\t"
            :
            :
            : "xmm1", "xmm3", "xmm5", "xmm7");
    }
    clk1 = rdtsc();
    //IACA_END;

    cycles = clk1 - clk0;
    print_GFLOPS(flops, cycles);
    print_throughput(instructions_per_loop * LOOP, cycles);
}

/*
 * VEX-encoded (three-operand) variant operating on 128-bit xmm registers:
 * each vaddps consumes the result of the vmulps directly before it.
 * Executing it requires a CPU with AVX support.
 */
__attribute__((noinline))
void avx_vmulps_vaddps_forwarding(void)
{
    const int flops_per_instruction = 4;    /* xmm operands: 4 packed floats */
    const int instructions_per_loop = 4;    /* must match the asm body below */
    const double flops = flops_per_instruction * instructions_per_loop * LOOP;
    uint64_t clk0, clk1, cycles;
    float __attribute__ ((aligned(16))) a[4] = {0.0f, 0.0f, 0.0f, 0.0f};

    printf("-- avx_vmulps_vaddps_forwarding --\n");
    zero_all_xmm();

    //IACA_START;
    clk0 = rdtsc();
    for (int i = 0; i < LOOP; ++i) {
        __asm__ __volatile__(
            "vmulps %0, %%xmm0, %%xmm4\n\t"
            "vaddps %%xmm4, %%xmm1, %%xmm1\n\t"
            "vmulps %0, %%xmm2, %%xmm5\n\t"
            "vaddps %%xmm5, %%xmm3, %%xmm3\n\t"
            :
            /* vmulps reads 16 bytes, so describe the whole array as input */
            : "m"(*(const float (*)[4])a)
            : "xmm1", "xmm3", "xmm4", "xmm5");
    }
    clk1 = rdtsc();
    //IACA_END;

    cycles = clk1 - clk0;
    print_GFLOPS(flops, cycles);
    print_throughput(instructions_per_loop * LOOP, cycles);
}

/* Run the three benchmarks in sequence. */
int main(void)
{
    sse_mulps_addps_no_dependency();
    sse_mulps_addps_forwarding();
    avx_vmulps_vaddps_forwarding();
    return 0;
}