Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Improve AVX, make detection a little nicer #865

Merged
merged 2 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
run: sudo sed -i 's/azure\.//' /etc/apt/sources.list && sudo apt update && sudo apt install ${{ matrix.cfg.cpp-version }} libsodium-dev libopus-dev zlib1g-dev rpm

- name: Generate CMake
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback -DCMAKE_BUILD_TYPE=Release ..
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 -DCMAKE_BUILD_TYPE=Release ..
env:
CXX: ${{matrix.cfg.cpp-version}}

Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
run: sudo sed -i 's/azure\.//' /etc/apt/sources.list && sudo apt update && sudo apt install ${{ matrix.cfg.cpp-version }} libsodium-dev libopus-dev zlib1g-dev rpm

- name: Generate CMake
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback -DCMAKE_BUILD_TYPE=Release ${{matrix.cfg.cmake-flags}} ..
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 -DCMAKE_BUILD_TYPE=Release ${{matrix.cfg.cmake-flags}} ..
env:
CXX: ${{matrix.cfg.cpp-version}}

Expand Down Expand Up @@ -131,7 +131,7 @@ jobs:
run: brew install cmake make libsodium opus openssl

- name: Generate CMake
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DDPP_CORO=ON -DAVX_TYPE=T_fallback ..
run: mkdir build && cd build && cmake -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DDPP_CORO=ON -DAVX_TYPE=AVX0 ..
env:
DONT_RUN_VCPKG: true

Expand Down Expand Up @@ -175,13 +175,13 @@ jobs:

- name: Generate CMake (x64)
if: ${{ matrix.cfg.arch == 'x64' }}
run: mkdir main/build && cd main/build && cmake -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback ..
run: mkdir main/build && cd main/build && cmake -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 ..
env:
DONT_RUN_VCPKG: true

- name: Generate CMake (x86)
if: ${{ matrix.cfg.arch == 'x86' }}
run: mkdir main/build && cd main/build && cmake -DCMAKE_TOOLCHAIN_FILE="cmake\Win32Toolchain.cmake" -DDPP_NO_VCPKG=ON -DAVX_TYPE=T_fallback -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -A Win32 -T host=x86 ..
run: mkdir main/build && cd main/build && cmake -DCMAKE_TOOLCHAIN_FILE="cmake\Win32Toolchain.cmake" -DDPP_NO_VCPKG=ON -DAVX_TYPE=AVX0 -G "Visual Studio ${{matrix.cfg.vsv}} ${{matrix.cfg.vs}}" -A Win32 -T host=x86 ..
env:
DONT_RUN_VCPKG: true

Expand Down Expand Up @@ -230,7 +230,7 @@ jobs:
run: sudo sed -i 's/azure\.//' /etc/apt/sources.list && sudo apt update && sudo apt install cmake rpm

- name: Generate CMakeFiles
run: mkdir build && cd build && sudo cmake ${{matrix.cfg.cmake-options}} -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DAVX_TYPE=T_fallback ..
run: mkdir build && cd build && sudo cmake ${{matrix.cfg.cmake-options}} -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release -DAVX_TYPE=AVX0 ..

- name: Compile Source
run: cd build && sudo make -j2
Expand Down Expand Up @@ -266,7 +266,7 @@ jobs:
# ls -lah
# mkdir build
# cd build
# cmake -DAVX_TYPE=T_fallback -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release ..
# cmake -DAVX_TYPE=AVX0 -DDPP_NO_VCPKG=ON -DCMAKE_BUILD_TYPE=Release ..
# make -j2
# make install
# cpack --verbose
Expand Down
30 changes: 17 additions & 13 deletions cmake/DetectArchitecture.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,30 @@ function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUC
if(${INSTRUCTION_SET_NAME})
set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE)
set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
else()
return()
endif()
endfunction()

if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
set(INSTRUCTION_SETS
"T_AVX?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"T_AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"T_AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
"AVX1?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
)
else()
set(INSTRUCTION_SETS
"T_AVX?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"T_AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"T_AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
"AVX1?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
"AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
"AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
)
endif()

set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}")

set(AVX_NAME "T_fallback")
set(AVX_TYPE "AVX0")
set(AVX_TYPE "AVX0" PARENT_SCOPE)
set(AVX_FLAGS "" PARENT_SCOPE)

# This is only supported on x86/x64, it is completely skipped and forced to T_fallback anywhere else
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "AMD64"))
Expand All @@ -54,11 +55,14 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} M
check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}")
endforeach()

string(REPLACE "T_" "" AVX_DISPLAY ${AVX_NAME})
message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} SSE type: ${AVX_DISPLAY}")
message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} AVX type: ${AVX_TYPE} (FLAGS: ${AVX_FLAG})")
set(AVX_TYPE ${AVX_TYPE})
set(AVX_TYPE ${AVX_TYPE} PARENT_SCOPE)
set(AVX_FLAG ${AVX_FLAG} PARENT_SCOPE)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}")
else()
message(STATUS "SSE not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_NAME}")
set(AVX_NAME "T_fallback")
set(AVX_TYPE "T_fallback")
message(STATUS "AVX not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_TYPE}")
set(AVX_TYPE "AVX0")
set(AVX_FLAG "" PARENT_SCOPE)
set(AVX_TYPE "AVX0" PARENT_SCOPE)
endif()
136 changes: 136 additions & 0 deletions include/dpp/isa/avx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/************************************************************************************
*
* D++, A Lightweight C++ library for Discord
*
* Copyright 2021 Craig Edwards and D++ contributors
* (https://github.com/brainboxdotcc/DPP/graphs/contributors)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
************************************************************************************/
#pragma once

#if defined _MSC_VER || defined __GNUC__ || defined __clang__

#include <immintrin.h>

#ifdef max
#undef max
#endif
#ifdef min
#undef min
#endif

namespace dpp {

using avx_float = __m128;
using avx_int = __m128i;

/*
* @brief Extracts a 32-bit integer from a 128-bit AVX register.
* @param value The AVX register containing packed 32-bit integers.
* @param index The index of the 32-bit integer to extract (0-3).
* @return The extracted 32-bit integer.
*/
inline int32_t extract_int32_from_avx(const avx_int& value, int64_t index) {
switch (index) {
case 0: {
return _mm_extract_epi32(value, 0);
}
case 1: {
return _mm_extract_epi32(value, 1);
}
case 2: {
return _mm_extract_epi32(value, 2);
}
case 3: {
return _mm_extract_epi32(value, 3);
}
default: {
return _mm_extract_epi32(value, 0);
}
}
}

/**
* @brief A class for audio mixing operations using AVX instructions.
*/
class audio_mixer {
public:
/*
* @brief The number of 32-bit values per CPU register.
*/
inline static constexpr int32_t byte_blocks_per_register{ 4 };

/*
* @brief Stores values from a 128-bit AVX vector to a storage location.
* @tparam value_type The target value type for storage.
* @param values_to_store The 128-bit AVX vector containing values to store.
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_int& values_to_store, value_type* storage_location) {
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_int32_from_avx(values_to_store, x));
}
}

/**
* @brief Specialization for gathering non-float values into an AVX register.
* @tparam value_type The type of values being gathered.
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX register containing gathered values.
*/
template<typename value_type> inline static avx_float gather_values(value_type* values) {
alignas(16) float new_array[byte_blocks_per_register]{};
for (size_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
}
return _mm_load_ps(new_array);
}

/**
* @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out.
* This version uses AVX instructions.
*
* @param data_in Pointer to the input array of int32_t values.
* @param data_out Pointer to the output array of int16_t values.
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };

current_samples_new = _mm_blendv_ps(_mm_max_ps(current_samples_new, _mm_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::min()))),
_mm_min_ps(current_samples_new, _mm_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::max()))),
_mm_cmp_ps(current_samples_new, _mm_set1_ps(0.0f), _CMP_GE_OQ));

store_values(_mm_cvtps_epi32(current_samples_new), data_out);
}

/**
* @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector.
* This version uses AVX instructions.
*
* @param up_sampled_vector Pointer to the array of int32_t values.
* @param decoded_data Pointer to the array of int16_t values.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm_cvtps_epi32(_mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) };
store_values(newValues, up_sampled_vector);
}
};

} // namespace dpp

#endif
Loading