[Question] Unexpected performance behavior #1081

clay4megtr · 2025-01-20T13:43:23Z

test code:

// Scalar reference implementation: stores the element-wise arithmetic mean of
// `a` and `b` into `res`.
//
// The number of elements processed is taken from `res.size()`; `a` and `b`
// are indexed without bounds checks, so both must hold at least that many
// elements.
void mean(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res) {
  const std::size_t count = res.size();
  std::size_t idx = 0;
  while (idx != count) {
    const double sum = a[idx] + b[idx];
    res[idx] = sum / 2;
    ++idx;
  }
}

void meanAVXUnaligned(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res) {
  using b_type = xsimd::batch<double, xsimd::avx>;
  std::size_t inc = b_type::size;  // step.
  std::size_t size = res.size();
  std::size_t vec_size = size - (size % inc);

  for (size_t i = 0; i < vec_size; i+= inc) {
    b_type avec = b_type::load_unaligned(&a[i]); // load
    b_type bvec = b_type::load_unaligned(&b[i]);
    b_type resv = (avec + bvec) / 2; // compute
    resv.store_unaligned(&res[i]);   // store
  }
  // Remaining part that cannot be vectorize
  for (std::size_t i = vec_size; i < size; ++i) {
      res[i] = (a[i] + b[i]) / 2;
  }
}

using vector_type = std::vector<double, xsimd::aligned_allocator<double>>;
void meanAVXAligned(const vector_type& a, const vector_type& b, vector_type& res) {
  using b_type = xsimd::batch<double, xsimd::avx>;
  std::size_t inc = b_type::size;
  std::size_t size = res.size();
  // size for which the vectorization is possible
  std::size_t vec_size = size - size % inc;
  for (std::size_t i = 0; i < vec_size; i += inc) {
      b_type avec = b_type::load_aligned(&a[i]);
      b_type bvec = b_type::load_aligned(&b[i]);
      b_type rvec = (avec + bvec) / 2;
      rvec.store_aligned(&res[i]);
  }
  // Remaining part that cannot be vectorize
  for (std::size_t i = vec_size; i < size; ++i) {
      res[i] = (a[i] + b[i]) / 2;
  }
}

// Benchmarks the element-wise mean over two vectors of `state.range(0)`
// random doubles drawn uniformly from [0, 10000).
//
// Template parameter `enable` selects the implementation under test:
//   false (default) -> mean            (scalar loop)
//   true            -> meanAVXUnaligned (xsimd AVX, unaligned load/store)
template<bool enable = false>
void benchVectorMean(benchmark::State& state) {
  // state.range() returns a signed 64-bit value; make the conversion explicit.
  const std::size_t count = static_cast<std::size_t>(state.range(0));

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<> dis(0.0, 10000.0);

  // Size the buffers up front so setup does not reallocate repeatedly.
  std::vector<double> a;
  std::vector<double> b;
  a.reserve(count);
  b.reserve(count);
  for (std::size_t i = 0; i < count; ++i) {
    a.push_back(dis(gen));
    b.push_back(dis(gen));
  }
  std::vector<double> c(count, 0.0);  // output buffer, zero-initialized

  for (auto _ : state) {
    if constexpr (enable) {
      meanAVXUnaligned(a, b, c);
    } else {
      mean(a, b, c);
    }
    // The results are never read back, so tell the optimizer that the output
    // buffer is observed; otherwise it is free to drop or hoist the work
    // being timed.
    double* out = c.data();
    benchmark::DoNotOptimize(out);
    benchmark::ClobberMemory();
  }
}
BENCHMARK_TEMPLATE(benchVectorMean)->RangeMultiplier(8)->Range(800, 51200);
BENCHMARK_TEMPLATE(benchVectorMean, true)->RangeMultiplier(8)->Range(800, 51200);

// Benchmarks meanAVXAligned over two vectors of `state.range(0)` random
// doubles drawn uniformly from [0, 10000). All three buffers are allocated
// through xsimd::aligned_allocator, as meanAVXAligned's signature requires.
void benchVectorMeanAligned(benchmark::State& state) {
  using aligned_vec = std::vector<double, xsimd::aligned_allocator<double>>;

  // state.range() returns a signed 64-bit value; make the conversion explicit.
  const std::size_t count = static_cast<std::size_t>(state.range(0));

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<> dis(0.0, 10000.0);

  // Size the buffers up front so setup does not reallocate repeatedly.
  aligned_vec a;
  aligned_vec b;
  a.reserve(count);
  b.reserve(count);
  for (std::size_t i = 0; i < count; ++i) {
    a.push_back(dis(gen));
    b.push_back(dis(gen));
  }
  aligned_vec c(count, 0.0);  // output buffer, zero-initialized

  for (auto _ : state) {
    meanAVXAligned(a, b, c);
    // The results are never read back, so tell the optimizer that the output
    // buffer is observed; otherwise it is free to drop or hoist the work
    // being timed.
    double* out = c.data();
    benchmark::DoNotOptimize(out);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(benchVectorMeanAligned)->RangeMultiplier(8)->Range(800, 51200);

test result:

Did I do something wrong?

The text was updated successfully, but these errors were encountered:

serge-sans-paille · 2025-01-20T15:18:56Z

Can you share your compiler invocation?

serge-sans-paille · 2025-01-20T21:00:26Z

% clang++ -DNDEBUG -mavx bench.cpp -o bench -O2 -Iinclude -lbenchmark
ssp@lakota:~/sources/xsimd (fix/rvv-cast)% ./bench
2025-01-20T21:59:56+01:00
Running ./bench
Run on (20 X 4900 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x10)
  L1 Instruction 32 KiB (x10)
  L2 Unified 1280 KiB (x10)
  L3 Unified 24576 KiB (x1)
Load Average: 2.37, 3.46, 3.16
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-----------------------------------------------------------------------
Benchmark                             Time             CPU   Iterations
-----------------------------------------------------------------------
benchVectorMean<>/800               110 ns          110 ns      6483429
benchVectorMean<>/4096             1085 ns         1083 ns       645867
benchVectorMean<>/32768            8824 ns         8812 ns        79532
benchVectorMean<>/51200           13954 ns        13936 ns        48260
benchVectorMean<true>/800           118 ns          118 ns      5916503
benchVectorMean<true>/4096         1071 ns         1069 ns       653371
benchVectorMean<true>/32768        8769 ns         8755 ns        79350
benchVectorMean<true>/51200       13816 ns        13793 ns        49613
benchVectorMeanAligned/800          119 ns          119 ns      5860759
benchVectorMeanAligned/4096        1066 ns         1064 ns       657942
benchVectorMeanAligned/32768       8762 ns         8748 ns        80568
benchVectorMeanAligned/51200      13973 ns        13951 ns        51240

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Question] Unexpected performance behavior #1081

[Question] Unexpected performance behavior #1081

clay4megtr commented Jan 20, 2025

serge-sans-paille commented Jan 20, 2025

serge-sans-paille commented Jan 20, 2025

[Question] Unexpected performance behavior #1081

[Question] Unexpected performance behavior #1081

Comments

clay4megtr commented Jan 20, 2025

serge-sans-paille commented Jan 20, 2025

serge-sans-paille commented Jan 20, 2025