Benchmark functions in <cmath> with GCC and MSVS-CodePudding

I have been tasked with benchmarking the time cost of almost every function in <cmath> for 64-bit integers and doubles. Here is my source code:

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>


// Minimal stopwatch on top of std::chrono::steady_clock.
// `timetype` is the std::chrono duration type (e.g. std::chrono::nanoseconds)
// whose tick count tok() reports.
template<typename timetype>
struct tiktok
{
  // Time point captured by the most recent call to tik().
  std::chrono::time_point<std::chrono::steady_clock> start;

  // (Re)start the stopwatch. Always returns 0.
  std::size_t tik()
  {
    start = std::chrono::steady_clock::now();
    return 0;
  }

  // Time elapsed since the most recent tik(), expressed in `timetype` ticks.
  std::size_t tok()
  {
    const auto elapsed = std::chrono::steady_clock::now() - start;
    return std::chrono::duration_cast<timetype>(elapsed).count();
  }
};


double mathHspeed(std::unordered_map<std::string, int64_t>& M,
                  int rngSeed, int maxIter = 100000)
{
  std::mt19937 rng(rngSeed);
  std::uniform_real_distribution<double> U(-5, 5);
  std::uniform_int_distribution<int32_t> Uint(-2147483647, 2147483647);
  tiktok<std::chrono::nanoseconds> timer;
  double S = 0;
  int64_t duration = 0;
  maxIter = (maxIter / 2) * 2; // Make sure maxIter is even.
  std::vector<int64_t> u(maxIter);
  std::vector<double> v(maxIter);
  int64_t loadingCost = 0;
  
  
  // Time cost of reading and writing 8-bytes = `loadingCost`
  // Let loadingCost just be 0 since we only need rough numbers.
  if (false)
  {
    for (int i = 0, iend = v.size(); i < iend;   i) v[i] = U(rng);
    
    
    timer.tik();
    for (int i = 0, iend = v.size() - 2; i < iend;   i) v[i]  = v[i   1];
    duration = timer.tok();
    
    
    timer.tik();
    for (int i = 0, iend = v.size() - 2; i < iend;   i) // one more addition.
      v[i]  = v[i   1]   v[i   2];
    std::size_t duration2 = timer.tok();
    
    
    loadingCost = std::max<int64_t>(
      0, (int64_t)duration - ((int64_t)duration2 - (int64_t)duration));
    S  = std::accumulate(v.begin(), v.end(), 0.0);
  }
  
  
#define sampleDouble for(int i = 0, iend = v.size(); i < iend;   i) v[i] = U(rng);
#define sampleInt for(int i = 0, iend = u.size(); i < iend;   i) u[i] = (int64_t)Uint(rng) - Uint(rng);
  
  
  sampleInt; timer.tik();
  for (int i = 0, iend = u.size() - 1; i < iend;   i)
    u[i]  = u[i   1];
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(u.begin(), u.end(), 0.0);
  M["  ( int64 )"] = duration;
  
  
  sampleInt; timer.tik();
  for (int i = 0, iend = u.size() - 1; i < iend;   i)
    u[i] *= u[i   1];
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(u.begin(), u.end(), 0.0);
  M["x ( int64 )"] = duration;
  
  
  for (int i = 0, iend = u.size() - 1; i < iend; i  = 2)
  {
    u[i] = (int64_t)Uint(rng) * Uint(rng); u[i   1] = Uint(rng);
  }
  timer.tik();
  for (int i = 0, iend = u.size() - 1; i < iend;   i)
    u[i] = u[i] / u[i   1]   u[i] % u[i   1];
  duration = std::max<int64_t>(timer.tok() - loadingCost - M["  ( int64 )"], 0);
  S  = std::accumulate(u.begin(), u.end(), 0.0);
  M["/% ( int64 )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i]  = v[i   1];
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["  ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] *= v[i   1];
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["x ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] /= v[i   1];
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["/ ( double )"] = duration;
  
  
  for (int i = 0, iend = u.size(); i < iend;   i) u[i] = (int64_t)Uint(rng) - Uint(rng);
  timer.tik();
  for (int i = 0, iend = u.size(); i < iend;   i)
    u[i] = std::abs(u[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(u.begin(), u.end(), 0.0);
  M["abs ( int64 )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::abs(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["abs ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size() - 1; i < iend; i  = 2)
  {
    v[i] = U(rng) * U(rng); v[i   1] = U(rng);
  }
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::fmod(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["fmod ( double, double )"] = duration;
  
  
  for (int i = 0, iend = v.size() - 1; i < iend; i  = 2)
  {
    v[i] = U(rng) * U(rng); v[i   1] = U(rng);
  }
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::remainder(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["remainder ( double, double )"] = duration;
  
  
  for (int i = 0, iend = v.size() - 1; i < iend; i  = 2)
  {
    v[i] = U(rng) * U(rng); v[i   1] = U(rng);
  }
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
  {
    int tmp = 0;
    v[i] = std::remquo(v[i], v[i   1], &tmp)   tmp;
  }
  duration = std::max<int64_t>(timer.tok() - loadingCost - M["  ( double )"], 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["remquo ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 2; i < iend;   i)
    v[i] = std::fma(v[i], v[i   1], v[i   2]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["fma ( double, double, double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::fmax(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["fmax ( double, double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::fmin(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["fmin ( double, double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::fdim(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["fdim ( double, double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::exp(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["exp ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::exp2(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["exp2 ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::expm1(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["expm1 ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng))   1e-6;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::log(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["log ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng))   1e-6;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::log10(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["log10 ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng))   1e-6;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::log2(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["log2 ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng))   1e-6;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::log1p(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["log1p ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng));
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::pow(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["pow ( double, double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng));
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::sqrt(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["sqrt ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::cbrt(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["cbrt ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::hypot(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["hypot ( double, double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size() - 2; i < iend;   i)
    v[i] = std::hypot(v[i], v[i   1], v[i   2]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["hypot ( double, double, double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::sin(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["sin ( double )"] = duration;
  
  
  sampleDouble; timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::cos(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["cos ( double )"] = duration;
  
  
  std::uniform_real_distribution<double> UhalfPi(-3.14 / 2, 3.14 / 2);
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = UhalfPi(rng);
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::tan(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["tan ( double )"] = duration;
  
  
  std::uniform_real_distribution<double> U_11(-0.99, 0.99);
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = U_11(rng);
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::asin(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["asin ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = U_11(rng);
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] =
    std::acos(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["acos ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::atan(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["atan ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::atan2(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["atan2 ( double, double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::sinh(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["sinh ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::cosh(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["cosh ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::tanh(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["tanh ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::asinh(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["asinh ( double )"] = duration;
  
  
  std::uniform_real_distribution<double> U1_10(1.1, 10);
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = U1_10(rng);
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::acosh(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["acosh ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = U_11(rng);
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::atanh(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["atanh ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng));
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::erf(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["erf ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::erfc(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["erfc ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng))   1e-3;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::tgamma(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["tgamma ( double )"] = duration;
  
  
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = std::abs(U(rng))   1e-3;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::lgamma(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["lgamma ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::ceil(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["ceil ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::floor(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["floor ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::trunc(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["trunc ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::round(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["round ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::nearbyint(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["nearbyint ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::rint(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["rint ( double )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
  {
    int tmp;
    v[i] = std::frexp(v[i], &tmp)   tmp;
  }
  duration = std::max<int64_t>(timer.tok() - loadingCost - M["  ( double )"], 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["frexp ( double, int* )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
  {
    int tmp = (int)v[i   1];
    v[i] = std::ldexp(v[i], tmp);
  }
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["ldexp ( double, int* )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
  {
    double tmp;
    v[i] = std::modf(v[i], &tmp)   tmp;
  }
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["modf ( double, double* )"] = duration;
  
  
  sampleDouble;
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::copysign(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["copysign ( double, double )"] = duration;
  
  
  std::uniform_real_distribution<double> betaU(0.001, 30);
  for (int i = 0, iend = v.size(); i < iend;   i) v[i] = betaU(rng);
  timer.tik();
  for (int i = 0, iend = v.size() - 1; i < iend;   i)
    v[i] = std::beta(v[i], v[i   1]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["beta ( double, double )"] = duration; // test x, y in [0.001, 30]
  
  
  std::uniform_real_distribution<double> expintU(-30, 30);
  for (int i = 0, iend = v.size(); i < iend;   i)
  {
    v[i] = expintU(rng);
    if (v[i] > -0.01 and v[i] < 0.01) v[i] = 0.01;
  }
  timer.tik();
  for (int i = 0, iend = v.size(); i < iend;   i)
    v[i] = std::expint(v[i]);
  duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
  S  = std::accumulate(v.begin(), v.end(), 0.0);
  M["expint ( double )"] = duration; // x != 0, test x in [-30, 30] & x != 0
  
  
  // std::uniform_real_distribution<double> riemann_zetaU(-1, 2);
  // for(int i = 0, iend = v.size(); i < iend;   i) v[i] = riemann_zetaU(rng);
  // timer.tik();
  // for(int i = 0, iend = v.size(); i < iend;   i)
  //   v[i] = std::riemann_zeta(v[i]);
  // duration = std::max<int64_t> (timer.tok() - loadingCost, 0);
  // S  = std::accumulate(v.begin(), v.end(), 0.0);
  // M["riemann_zeta ( double )"] = duration; // test x in [-1, 2].
  
  
  return S;
}


// Interactive driver. Reads a random seed, an element count and an output
// path from standard input, runs mathHspeed(), and writes one comma-separated
// row per benchmark (label, time in microseconds, time relative to the first
// int64 benchmark) to the output file. Returns 1 if the input cannot be
// parsed or the output file cannot be opened, 0 otherwise.
int main()
{
  std::cout << "Random seed: ";
  int randomSeed = 0;
  std::cin >> randomSeed;
  std::cout << "Max iteration: ";
  int maxIter = 0;
  std::cin >> maxIter;
  std::cout << "Save file path: ";
  std::string save;
  std::cin >> save;
  if (!std::cin)
  {
    // Without this check a failed read would feed unset values to the benchmark.
    std::cerr << "Invalid input." << std::endl;
    return 1;
  }


  std::unordered_map<std::string, int64_t> M;
  double S = mathHspeed(M, randomSeed, maxIter);


  // 56 benchmarks are listed; riemann_zeta would be the 57th but is disabled.
  // The labels must match the keys written by mathHspeed() exactly.
  constexpr int Nfuns = 56;
  const std::string funNames[Nfuns] = {
    "  ( int64 )", "x ( int64 )", "/% ( int64 )",
    "  ( double )", "x ( double )", "/ ( double )",
    "abs ( int64 )", "abs ( double )", "fmod ( double, double )",
    "remainder ( double, double )", "remquo ( double )",
    "fma ( double, double, double )", "fmax ( double, double )",
    "fmin ( double, double )", "fdim ( double, double )",
    "exp ( double )", "exp2 ( double )", "expm1 ( double )",
    "log ( double )", "log10 ( double )", "log2 ( double )",
    "log1p ( double )", "pow ( double, double )", "sqrt ( double )",
    "cbrt ( double )", "hypot ( double, double )",
    "hypot ( double, double, double )", "sin ( double )",
    "cos ( double )", "tan ( double )", "asin ( double )",
    "acos ( double )", "atan ( double )", "atan2 ( double, double )",
    "sinh ( double )", "cosh ( double )", "tanh ( double )",
    "asinh ( double )", "acosh ( double )", "atanh ( double )",
    "erf ( double )", "erfc ( double )", "tgamma ( double )",
    "lgamma ( double )", "ceil ( double )", "floor ( double )",
    "trunc ( double )", "round ( double )", "nearbyint ( double )",
    "rint ( double )", "frexp ( double, int* )", "ldexp ( double, int* )",
    "modf ( double, double* )", "copysign ( double, double )",
    "beta ( double, double )", "expint ( double )"//, "riemann_zeta ( double )"
  };


  // Labels as written to the file: the same text with commas removed, so a
  // label never splits into several comma-separated fields.
  std::string funNamesNoComma[Nfuns];
  for (int i = 0; i < Nfuns; ++i)
    for (const char c : funNames[i])
      if (c != ',') funNamesNoComma[i].push_back(c);


  // Time of each benchmark relative to the first int64 one, to one decimal.
  const double baseline = M["  ( int64 )"] + 0.0;
  double relativeTime[Nfuns];
  for (int i = 0; i < Nfuns; ++i)
    relativeTime[i] = std::round(M[funNames[i]] / baseline * 10) / 10.0;


  constexpr int nameWidth = 32;
  constexpr int realtimewd = 16;
  constexpr int relatimewd = 16;
  std::ofstream out(save.c_str());
  if (!out)
  {
    std::cerr << "Cannot open output file: " << save << std::endl;
    return 1;
  }
  // M holds nanoseconds and the rows divide by 1000, so the unit is microseconds.
  out << std::setw(nameWidth) << "Function name,"
      << std::setw(realtimewd) << "Time cost (us),"
      << std::setw(relatimewd) << "Relative" << '\n';


  for (int i = 0; i < Nfuns; ++i)
  {
    out << std::setw(nameWidth) << funNamesNoComma[i] << ","
        << std::setw(realtimewd) << std::round(M[funNames[i]] / 1000.0) << ","
        << std::setw(relatimewd) << relativeTime[i] << '\n';
  }


  std::cout << "\nDummy sum = " << S << std::endl;


  return 0;
}

Compile the code using GCC-8.3 (-O0 or -O3), and MSVS Community 2019 (/O2), on a Windows 64-bit laptop with Intel i9-9980, 512KB L1 cache, 2MB L2 cache, 16MB L3 cache, and then input the following parameters:

The table below shows the results:

To be more specific, the optimization menu in MSVS looks like this:

I cannot set "Whole Program Optimization" to Yes because MSVS keeps complaining about "/ZI and /GL incompatible".

My questions:

(1) I am a newbie to the MSVS toolchain. Why is the executable built by MSVS so slow for most functions? MSVS /O2 can not even outperform GCC -O0. How to make MSVS produce equally fast code? I noticed GCC produces a single .exe of about 3MB, but MSVS produces a .exe of about 154KB and a .pdb of 2.9MB.

(2) Interestingly, there are a few functions such as sin(x), cos(x) and exp(x) where MSVS code is much faster. Any reason besides possible different library implementations?

(3) Why on earth is exp2(x) about 5x slower than exp(x) in MSVS? I tried swapping the code blocks of the two functions in the source file. It makes no difference.

Thanks!

CodePudding user response：

To find the bottleneck, you first have to identify where the time is being spent (bad code generation, memory access/throughput). You are obviously trying to do that already. Keep in mind that profiling itself consumes a lot of resources too. MSVC provides built-in instruction-level profiling, which might help you quickly identify instruction-related hot spots. You can measure the whole program, or just the code between two breakpoints.

Not sure if whole program optimiation on the screenshot you shared is set to no for a specific reason.

Other performance measurement tools, such as xperf or the various chip manufacturers' tools (depending on the hardware used), can help to measure other resources (memory, including cache misses, etc.).

There are a few compiler settings that can help to optimize for specific scenarios. Anyhow you have to figure out why performance is not optimal.

There are various compiler switches (@njuffa pointed out a few) that modify compiler behavior, such as relaxing floating-point strictness with /fp:fast. Apparently you have tried those already. /arch allows auto-vectorization using the specified instruction set (SSE/AVX/AVX2/AVX-512). This is CPU dependent, so check the supported instruction sets first, e.g. using CPU-Z (http://www.cpuid.com). This could increase performance through automatic parallelization/SIMD vectorization. You might also want to favor optimization for a specific CPU family with /favor:AMD64, /favor:INTEL64 or /favor:ATOM, since that helps the compiler take chip-specific instruction latency/throughput into account.

CPU instructions exist for all of the functions mentioned, so I guess the difference depends on the library implementation: since the hardware is the same, it apparently is not clear-cut for the compiler which approach to use.

You could try another compiler such as LLVM (clang); see https://docs.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170 for how to set it up. In my experience, each compiler has its own strengths and weaknesses. For example, it does not seem to be easy to get conditional moves out of MSVC when it comes to creating pipeline-friendly code.