Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Question] Unexpected performance behavior #1081

Open
clay4megtr opened this issue Jan 20, 2025 · 2 comments
Open

[Question] Unexpected performance behavior #1081

clay4megtr opened this issue Jan 20, 2025 · 2 comments

Comments

@clay4megtr
Copy link

test code:

/// Scalar reference implementation: writes the element-wise mean of `a` and
/// `b` into `res`.
///
/// The number of elements processed is `res.size()`; `a` and `b` are indexed
/// over that same range, so each must hold at least that many elements.
void mean(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res) {
  const std::size_t total = res.size();
  std::size_t pos = 0;
  while (pos != total) {
    const double sum = a[pos] + b[pos];
    res[pos] = sum / 2;
    ++pos;
  }
}

void meanAVXUnaligned(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res) {
  using b_type = xsimd::batch<double, xsimd::avx>;
  std::size_t inc = b_type::size;  // step.
  std::size_t size = res.size();
  std::size_t vec_size = size - (size % inc);

  for (size_t i = 0; i < vec_size; i+= inc) {
    b_type avec = b_type::load_unaligned(&a[i]); // load
    b_type bvec = b_type::load_unaligned(&b[i]);
    b_type resv = (avec + bvec) / 2; // compute
    resv.store_unaligned(&res[i]);   // store
  }
  // Remaining part that cannot be vectorize
  for (std::size_t i = vec_size; i < size; ++i) {
      res[i] = (a[i] + b[i]) / 2;
  }
}

using vector_type = std::vector<double, xsimd::aligned_allocator<double>>;
void meanAVXAligned(const vector_type& a, const vector_type& b, vector_type& res) {
  using b_type = xsimd::batch<double, xsimd::avx>;
  std::size_t inc = b_type::size;
  std::size_t size = res.size();
  // size for which the vectorization is possible
  std::size_t vec_size = size - size % inc;
  for (std::size_t i = 0; i < vec_size; i += inc) {
      b_type avec = b_type::load_aligned(&a[i]);
      b_type bvec = b_type::load_aligned(&b[i]);
      b_type rvec = (avec + bvec) / 2;
      rvec.store_aligned(&res[i]);
  }
  // Remaining part that cannot be vectorize
  for (std::size_t i = vec_size; i < size; ++i) {
      res[i] = (a[i] + b[i]) / 2;
  }
}

/// Benchmarks the element-wise mean of two vectors of random doubles.
///
/// @tparam enable  when true, times `meanAVXUnaligned`; otherwise times the
///                 scalar `mean`.
/// @param state    Google Benchmark state; `state.range(0)` is the number of
///                 elements in each vector.
template<bool enable = false>
void benchVectorMean(benchmark::State& state) {
  const std::size_t count = static_cast<std::size_t>(state.range(0));

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<> dis(0.0, 10000.0);

  // Size the buffers once up front instead of growing them element by element.
  std::vector<double> a;
  std::vector<double> b;
  a.reserve(count);
  b.reserve(count);
  for (std::size_t i = 0; i < count; i++) {
    a.push_back(dis(gen));
    b.push_back(dis(gen));
  }
  std::vector<double> c(count, 0.0);  // output buffer, zero-initialised

  for (auto _: state) {
    if constexpr (enable) {
      meanAVXUnaligned(a, b, c);
    } else {
      mean(a, b, c);
    }
    // Nothing reads `c` after the call, so without these barriers the
    // optimiser is free to drop or hoist the stores being measured.
    double* out = c.data();
    benchmark::DoNotOptimize(out);
    benchmark::ClobberMemory();
  }
}
BENCHMARK_TEMPLATE(benchVectorMean)->RangeMultiplier(8)->Range(800, 51200);
BENCHMARK_TEMPLATE(benchVectorMean, true)->RangeMultiplier(8)->Range(800, 51200);

/// Benchmarks `meanAVXAligned` on vectors of random doubles allocated with
/// `xsimd::aligned_allocator`.
///
/// @param state  Google Benchmark state; `state.range(0)` is the number of
///               elements in each vector.
void benchVectorMeanAligned(benchmark::State& state) {
  using aligned_vector = std::vector<double, xsimd::aligned_allocator<double>>;

  const std::size_t count = static_cast<std::size_t>(state.range(0));

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<> dis(0.0, 10000.0);

  // Size the buffers once up front instead of growing them element by element.
  aligned_vector a;
  aligned_vector b;
  a.reserve(count);
  b.reserve(count);
  for (std::size_t i = 0; i < count; i++) {
    a.push_back(dis(gen));
    b.push_back(dis(gen));
  }
  aligned_vector c(count, 0.0);  // output buffer, zero-initialised

  for (auto _: state) {
    meanAVXAligned(a, b, c);
    // Nothing reads `c` after the call, so without these barriers the
    // optimiser is free to drop or hoist the stores being measured.
    double* out = c.data();
    benchmark::DoNotOptimize(out);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(benchVectorMeanAligned)->RangeMultiplier(8)->Range(800, 51200);

Test result:

Image

Did I do something wrong?

@serge-sans-paille
Copy link
Contributor

Can you share your compiler invocation?

@serge-sans-paille
Copy link
Contributor

% clang++ -DNDEBUG -mavx bench.cpp -o bench -O2 -Iinclude -lbenchmark
ssp@lakota:~/sources/xsimd (fix/rvv-cast)% ./bench
2025-01-20T21:59:56+01:00
Running ./bench
Run on (20 X 4900 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x10)
  L1 Instruction 32 KiB (x10)
  L2 Unified 1280 KiB (x10)
  L3 Unified 24576 KiB (x1)
Load Average: 2.37, 3.46, 3.16
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-----------------------------------------------------------------------
Benchmark                             Time             CPU   Iterations
-----------------------------------------------------------------------
benchVectorMean<>/800               110 ns          110 ns      6483429
benchVectorMean<>/4096             1085 ns         1083 ns       645867
benchVectorMean<>/32768            8824 ns         8812 ns        79532
benchVectorMean<>/51200           13954 ns        13936 ns        48260
benchVectorMean<true>/800           118 ns          118 ns      5916503
benchVectorMean<true>/4096         1071 ns         1069 ns       653371
benchVectorMean<true>/32768        8769 ns         8755 ns        79350
benchVectorMean<true>/51200       13816 ns        13793 ns        49613
benchVectorMeanAligned/800          119 ns          119 ns      5860759
benchVectorMeanAligned/4096        1066 ns         1064 ns       657942
benchVectorMeanAligned/32768       8762 ns         8748 ns        80568
benchVectorMeanAligned/51200      13973 ns        13951 ns        51240

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants