#include <time.h>
#include <sys/time.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <iacaMarks.h>

/*
 * Clock rate, in GHz, that print_GFLOPS() uses to turn cycle counts into
 * seconds and GFLOPS.
 * NOTE(review): hard-coded; presumably meant to match the clock of the
 * machine the benchmark runs on -- confirm before trusting the output.
 */
#define GHz 3.40

/*
 * Read the CPU time-stamp counter with the RDTSC instruction.
 *
 * RDTSC delivers the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX.  Both halves are captured through register output constraints and
 * merged in C, so the same code is correct for 32-bit and 64-bit builds and
 * no register has to be shuffled by hand inside the asm.
 *
 * RDTSC is not a serializing instruction, so neighbouring instructions may
 * still be in flight when the counter is sampled.
 *
 * Returns: the full 64-bit counter value.
 */
static inline uint64_t
rdtsc(void)
{
  uint32_t lo, hi;

  __asm__ volatile ("rdtsc" : "=a"(lo), "=d"(hi));

  /* Widen before shifting so the high half is not shifted out of 32 bits. */
  return ((uint64_t)hi << 32) | lo;
}

  /*
   * Print the floating-point performance of one benchmark run.
   *
   *   flops  - number of floating-point operations that were executed
   *   cycles - number of clock cycles the run took
   *
   * Reports flops per clock, GFLOPS at the GHz clock rate, and the elapsed
   * time in seconds derived from the same clock rate.
   */
  void
print_GFLOPS(double flops, uint64_t cycles)
{
  const double clocks = (double)cycles;
  const double flops_per_clock = flops / clocks;
  const double gflops = flops * GHz / clocks;
  const double seconds = clocks * 1e-9 / GHz;

  printf("GFLOPS @ %.2fGHz:\n"
         "  %.3f [flops/clock] = %.3f [GFLOPS]"
         "  (%.0f flops in %" PRIu64 " clock = %f sec)\n",
         GHz, flops_per_clock, gflops, flops, cycles, seconds);
}

  /*
   * Print the instruction throughput of one benchmark run.
   *
   *   instructions - number of instructions that were executed
   *   cycles       - number of clock cycles the run took
   *
   * Reports instructions per clock followed by the two raw counts.
   */
  void
print_throughput(uint64_t instructions, uint64_t cycles)
{
  printf("Throughput:\n  %.3f [instructions/clock]   (%"PRIu64" instructions in %"PRIu64" clock)\n",
         (double)instructions / (double)cycles, instructions, cycles);
}

/*
 * Clear registers xmm0 through xmm7 (each is XORed with itself).
 *
 * The registers are listed as clobbers so the compiler knows this statement
 * overwrites them and will not keep a live value in any of them across it.
 */
#define zero_all_xmm() \
  do { \
    __asm__ volatile \
    ("xorps %%xmm0, %%xmm0\n\t" \
     "xorps %%xmm1, %%xmm1\n\t" \
     "xorps %%xmm2, %%xmm2\n\t" \
     "xorps %%xmm3, %%xmm3\n\t" \
     "xorps %%xmm4, %%xmm4\n\t" \
     "xorps %%xmm5, %%xmm5\n\t" \
     "xorps %%xmm6, %%xmm6\n\t" \
     "xorps %%xmm7, %%xmm7\n\t" \
     : \
     : \
     : "xmm0", "xmm1", "xmm2", "xmm3", \
       "xmm4", "xmm5", "xmm6", "xmm7" \
    ); \
  } while (0)

/* Number of iterations each benchmark loop executes (2^21). */
#define LOOP (1 << 21)

/*
 * Benchmark SSE mulps/addps pairs in which every addps consumes the result
 * of the mulps directly before it (mulps -> xmm0 -> addps and
 * mulps -> xmm1 -> addps).
 *
 * Clears the xmm registers, runs the four-instruction sequence LOOP times
 * between two rdtsc() reads, then prints the GFLOPS and instruction
 * throughput derived from the elapsed cycle count.
 */
void
sse_mulps_addps_forwarding(void)
{
  const int flops_per_instruction = 4;  /* packed single: 4 floats per xmm op */
  const int instructions_per_loop = 4;  /* 2 x mulps + 2 x addps per iteration */
  const double flops = flops_per_instruction * instructions_per_loop * LOOP;
  uint64_t clk0, clk1, cycles;
  int i;
  float __attribute__ ((aligned(16))) a[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  printf("-- sse_mulps_addps_forwarding --\n");

  zero_all_xmm();
//IACA_START;
  clk0 = rdtsc();
  for (i = 0; i < LOOP; ++i) {
    __asm__ volatile
      (
        "mulps %0, %%xmm0\n\t"
        "addps %%xmm0, %%xmm4\n\t"
        "mulps %0, %%xmm1\n\t"
        "addps %%xmm1, %%xmm5\n\t"
        :
        /* mulps loads all 16 bytes, so expose the whole array as the input. */
        :"m"(*(const float (*)[4])a)
        /* Registers written by the sequence. */
        :"xmm0", "xmm1", "xmm4", "xmm5"
      );
  }
  clk1 = rdtsc();
//IACA_END;
  cycles = clk1 - clk0;
  print_GFLOPS(flops, cycles);
  print_throughput(instructions_per_loop * LOOP, cycles);
}

/*
 * Benchmark SSE mulps/addps instructions that each work on their own pair of
 * registers (xmm0/xmm1, xmm2/xmm3, xmm4/xmm5, xmm6/xmm7), so no instruction
 * in the sequence reads a register written by another one in the sequence.
 * Each destination register is still reused by the same instruction on the
 * next iteration.
 *
 * Clears the xmm registers, runs the four-instruction sequence LOOP times
 * between two rdtsc() reads, then prints the GFLOPS and instruction
 * throughput derived from the elapsed cycle count.
 */
void
sse_mulps_addps_no_dependency(void)
{
  const int flops_per_instruction = 4;  /* packed single: 4 floats per xmm op */
  const int instructions_per_loop = 4;  /* 2 x mulps + 2 x addps per iteration */
  const double flops = flops_per_instruction * instructions_per_loop * LOOP;
  uint64_t clk0, clk1, cycles;
  int i;

  printf("-- sse_mulps_addps_no_dependency --\n");

  zero_all_xmm();
//IACA_START;
  clk0 = rdtsc();
  for (i = 0; i < LOOP; ++i) {
    __asm__ volatile
      ("mulps %%xmm0, %%xmm1\n\t"
       "addps %%xmm2, %%xmm3\n\t"
       "mulps %%xmm4, %%xmm5\n\t"
       "addps %%xmm6, %%xmm7\n\t"
       :
       :
       /* Destination registers written by the sequence. */
       :"xmm1", "xmm3", "xmm5", "xmm7"
      );
  }
  clk1 = rdtsc();
//IACA_END;
  cycles = clk1 - clk0;
  print_GFLOPS(flops, cycles);
  print_throughput(instructions_per_loop * LOOP, cycles);
}
/*
 * Benchmark the three-operand vmulps/vaddps forms on xmm registers, with
 * every vaddps consuming the result of the vmulps directly before it
 * (vmulps -> xmm4 -> vaddps and vmulps -> xmm5 -> vaddps).
 *
 * Clears the xmm registers, runs the four-instruction sequence LOOP times
 * between two rdtsc() reads, then prints the GFLOPS and instruction
 * throughput derived from the elapsed cycle count.
 */
__attribute__((noinline))
void
avx_vmulps_vaddps_forwarding(void)
{
  const int flops_per_instruction = 4;  /* packed single: 4 floats per xmm op */
  const int instructions_per_loop = 4;  /* 2 x vmulps + 2 x vaddps per iteration */
  const double flops = flops_per_instruction * instructions_per_loop * LOOP;
  uint64_t clk0, clk1, cycles;
  int i;
  float __attribute__ ((aligned(16))) a[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  printf("-- avx_vmulps_vaddps_forwarding --\n");

  zero_all_xmm();
//IACA_START;
  clk0 = rdtsc();
  for (i = 0; i < LOOP; ++i) {
    __asm__ volatile
      (
        "vmulps %0, %%xmm0, %%xmm4\n\t"
        "vaddps %%xmm4, %%xmm1, %%xmm1\n\t"
        "vmulps %0, %%xmm2, %%xmm5\n\t"
        "vaddps %%xmm5, %%xmm3, %%xmm3\n\t"
        :
        /* vmulps loads all 16 bytes, so expose the whole array as the input. */
        :"m"(*(const float (*)[4])a)
        /* Registers written by the sequence. */
        :"xmm1", "xmm3", "xmm4", "xmm5"
      );
  }
  clk1 = rdtsc();
//IACA_END;
  cycles = clk1 - clk0;
  print_GFLOPS(flops, cycles);
  print_throughput(instructions_per_loop * LOOP, cycles);
}

int
main()
{
  sse_mulps_addps_no_dependency();
  sse_mulps_addps_forwarding();
  avx_vmulps_vaddps_forwarding();
  return 0;
}