Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MSVC Builds & Python Argument Parsing #181

Merged
merged 11 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,16 @@ if(${STRINGZILLA_BUILD_SHARED})
"SZ_USE_ARM_NEON=1"
"SZ_USE_ARM_SVE=1")
endif()

if (MSVC)
# Add dependencies for necessary runtime libraries in case of static linking
# This ensures that basic runtime functions are available:
# msvcrt.lib: Microsoft Visual C Runtime, required for basic C runtime functions on Windows.
# vcruntime.lib: Microsoft Visual C++ Runtime library for basic runtime functions.
# ucrt.lib: Universal C Runtime, necessary for linking basic C functions like I/O.
target_link_libraries(${target} PRIVATE msvcrt.lib vcruntime.lib ucrt.lib)
endif()

endfunction()

define_shared(stringzilla_shared)
Expand All @@ -344,4 +354,6 @@ if(${STRINGZILLA_BUILD_SHARED})
"$<$<CXX_COMPILER_ID:MSVC>:/Oi-;/GS->")
target_link_options(stringzillite PRIVATE "$<$<CXX_COMPILER_ID:GNU,Clang>:-nostdlib>")
target_link_options(stringzillite PRIVATE "$<$<CXX_COMPILER_ID:MSVC>:/NODEFAULTLIB>")


endif()
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ __Who is this for?__
<span style="color:#ABABAB;">arm:</span> <b>9.4</b> MB/s
</td>
<td align="center">
<code>uniform_int_distribution</code><br/>
<code>std::uniform_int_distribution</code><br/>
<span style="color:#ABABAB;">x86:</span> <b>47.2</b> &centerdot;
<span style="color:#ABABAB;">arm:</span> <b>20.4</b> MB/s
</td>
Expand All @@ -193,7 +193,7 @@ __Who is this for?__
<tr>
<td align="center">⚪</td>
<td align="center">
<code>transform</code><br/>
<code>std::transform</code><br/>
<span style="color:#ABABAB;">x86:</span> <b>3.81</b> &centerdot;
<span style="color:#ABABAB;">arm:</span> <b>2.65</b> GB/s
</td>
Expand Down
13 changes: 6 additions & 7 deletions c/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,21 +232,20 @@ static void sz_dispatch_table_init(void) {
}

#if defined(_MSC_VER)
// Store a pointer to `sz_dispatch_table_init` in the read-only `.CRT$XCU` section.
// NOTE(review): presumably this makes the MSVC CRT startup code invoke the initializer
// before user code runs - confirm against the Microsoft CRT initialization docs.
// The pointer uses a proper `(void)` prototype rather than an empty parameter list,
// so it matches the `void (void)` initializer it is assigned from.
#pragma section(".CRT$XCU", read)
__declspec(allocate(".CRT$XCU")) void (*_sz_dispatch_table_init)(void) = sz_dispatch_table_init;

/**
 *  @brief  DLL entry point for MSVC builds.
 *
 *  Calls `sz_dispatch_table_init()` when the library is attached to a process.
 *  Every other notification is acknowledged without doing any work.
 *
 *  @param hints            Module handle passed by the loader; unused.
 *  @param forward_reason   Notification code (`DLL_PROCESS_ATTACH`, `DLL_THREAD_ATTACH`, ...).
 *  @param lp               Reserved loader argument; unused.
 *  @return `TRUE` for every notification, including unrecognized reason codes.
 */
BOOL WINAPI DllMain(HINSTANCE hints, DWORD forward_reason, LPVOID lp) {
    (void)hints;
    (void)lp;
    switch (forward_reason) {
    case DLL_PROCESS_ATTACH:
        sz_dispatch_table_init(); // Ensure initialization
        return TRUE;
    case DLL_THREAD_ATTACH:
    case DLL_THREAD_DETACH:
    case DLL_PROCESS_DETACH:
    default:
        // Nothing to set up or tear down; the explicit `default` guarantees that
        // control never falls off the end of this non-void function.
        return TRUE;
    }
}

#if SZ_AVOID_LIBC
/**
 *  @brief  Entry point compiled only when `SZ_AVOID_LIBC` is enabled.
 *
 *  Forwards all three arguments to `DllMain` unchanged and reports success
 *  regardless of what `DllMain` returns.
 */
BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) {
    // The result of `DllMain` is intentionally discarded.
    (void)DllMain(hints, forward_reason, lp);
    return TRUE;
}
#endif

#else
// Non-MSVC counterpart of the initialization hook: a function carrying the `constructor`
// attribute whose only job is to call `sz_dispatch_table_init()`.
__attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) {
    sz_dispatch_table_init();
}
Expand Down
12 changes: 10 additions & 2 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s
// operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls.
// Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards.
//
// - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
// - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
// - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 6 cycles latency, ports: 1*FP12
// - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p05
// - On Genoa: 1 cycle latency, ports: 1*FP0123
// - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 4 cycles latency, ports: 1*FP01
//
sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut));
lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));
Expand Down
Loading
Loading